diff options
507 files changed, 19752 insertions, 8520 deletions
@@ -42,5 +42,5 @@ build:remote --extra_toolchains=//tools/bazeldefs:cc-toolchain-clang-x86_64-defa build:remote --extra_execution_platforms=//tools/bazeldefs:rbe_ubuntu1604 build:remote --platforms=//tools/bazeldefs:rbe_ubuntu1604 build:remote --crosstool_top=@rbe_default//cc:toolchain -build:remote --jobs=300 +build:remote --jobs=100 build:remote --remote_timeout=3600 diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command new file mode 100644 index 000000000..ce3111f3c --- /dev/null +++ b/.buildkite/hooks/post-command @@ -0,0 +1,24 @@ +# Upload test logs on failure, if there are any. +if [[ "${BUILDKITE_COMMAND_EXIT_STATUS}" -ne "0" ]]; then + declare log_count=0 + for log in $(make testlogs 2>/dev/null | sort | uniq); do + buildkite-agent artifact upload "${log}" + log_count=$((${log_count}+1)) + # N.B. If *all* tests fail due to some common cause, then we will + # end up spending way too much time uploading logs. Instead, we just + # upload the first 100 and stop. That is hopefully enough to debug. + if [[ "${log_count}" -ge 100 ]]; then + echo "Only uploaded first 100 failures; skipping the rest." + break + fi + done + # Attempt to clear the cache and shut down. + make clean || echo "make clean failed with code $?" + make bazel-shutdown || echo "make bazel-shutdown failed with code $?" +fi + +# Kill any running containers (clear state). +CONTAINERS="$(docker ps -q)" +if ! [[ -z "${CONTAINERS}" ]]; then + docker container kill ${CONTAINERS} 2>/dev/null || true +fi
\ No newline at end of file diff --git a/.buildkite/hooks/pre-command b/.buildkite/hooks/pre-command new file mode 100644 index 000000000..7d277202b --- /dev/null +++ b/.buildkite/hooks/pre-command @@ -0,0 +1,11 @@ +# Setup for parallelization with PARTITION and TOTAL_PARTITIONS. +export PARTITION=${BUILDKITE_PARALLEL_JOB:-0} +PARTITION=$((${PARTITION}+1)) # 1-indexed, but PARALLEL_JOB is 0-indexed. +export TOTAL_PARTITIONS=${BUILDKITE_PARALLEL_JOB_COUNT:-1} + +# Ensure Docker has experimental enabled. +EXPERIMENTAL=$(sudo docker version --format='{{.Server.Experimental}}') +if test "${EXPERIMENTAL}" != "true"; then + make sudo TARGETS=//runsc:runsc ARGS="install --experimental=true" + sudo systemctl restart docker +fi diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml new file mode 100644 index 000000000..79a80d9c8 --- /dev/null +++ b/.buildkite/pipeline.yaml @@ -0,0 +1,107 @@ +_templates: + common: &common + timeout_in_minutes: 30 + retry: + automatic: + - exit_status: -1 + limit: 10 + - exit_status: "*" + limit: 2 + +steps: + # Run basic smoke tests before preceding to other tests. + - <<: *common + label: ":fire: Smoke tests" + command: make smoke-tests + - wait + + # Check that the Go branch builds. + - <<: *common + label: ":golang: Go branch" + commands: + - make go + - git checkout go && git clean -f + - go build ./... + + # Release workflow. + - <<: *common + label: ":ship: Release tests" + commands: make release + + # Basic unit tests. + - <<: *common + label: ":test_tube: Unit tests" + command: make unit-tests + + # All system call tests. + - <<: *common + label: ":toolbox: System call tests" + command: make syscall-tests + parallelism: 20 + + # Integration tests. + - <<: *common + label: ":parachute: FUSE tests" + command: make fuse-tests + - <<: *common + label: ":docker: Docker tests" + command: make docker-tests + - <<: *common + label: ":goggles: Overlay tests" + command: make overlay-tests + - <<: *common + label: ":safety_pin: Host network tests" + command: make hostnet-tests + - <<: *common + label: ":satellite: SWGSO tests" + command: make swgso-tests + - <<: *common + label: ":coffee: Do tests" + command: make do-tests + - <<: *common + label: ":person_in_lotus_position: KVM tests" + command: make kvm-tests + - <<: *common + label: ":docker: Containerd 1.3.9 tests" + command: make containerd-test-1.3.9 + - <<: *common + label: ":docker: Containerd 1.4.3 tests" + command: make containerd-test-1.4.3 + + # Check the website builds. + - <<: *common + label: ":earth_americas: Website tests" + command: make website-build + + # Networking tests. + - <<: *common + label: ":table_tennis_paddle_and_ball: IPTables tests" + command: make iptables-tests + - <<: *common + label: ":construction_worker: Packetdrill tests" + command: make packetdrill-tests + - <<: *common + label: ":hammer: Packetimpact tests" + command: make packetimpact-tests + + # Runtime tests. + - <<: *common + label: ":php: PHP runtime tests" + command: make php7.3.6-runtime-tests_vfs2 + parallelism: 10 + - <<: *common + label: ":java: Java runtime tests" + command: make java11-runtime-tests_vfs2 + parallelism: 40 + - <<: *common + label: ":golang: Go runtime tests" + command: make go1.12-runtime-tests_vfs2 + parallelism: 10 + - <<: *common + label: ":node: NodeJS runtime tests" + command: make nodejs12.4.0-runtime-tests_vfs2 + parallelism: 10 + - <<: *common + label: ":python: Python runtime tests" + command: make python3.7.3-runtime-tests_vfs2 + parallelism: 10 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index 264b4e9fa..000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,5 +0,0 @@ -* [ ] Have you followed the guidelines in [CONTRIBUTING.md](../blob/master/CONTRIBUTING.md)? -* [ ] Have you formatted and linted your code? -* [ ] Have you added relevant tests? -* [ ] Have you added appropriate Fixes & Updates references? -* [ ] If yes, please erase all these lines! diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e28e46352..3be10b9bb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,13 +1,18 @@ +# This workflow builds the source code, extracts nogo annotations and +# posts them to GitHub, if applicable. This leverages the fact that the +# workflow token has appropriate permissions to do so, and attempts to +# leverage the GitHub workflow caches. +# +# This workflow also generates the build badge that is referred to by +# the main README. name: "Build" on: push: branches: - master - - feature/** pull_request: branches: - - master - - feature/** + - "**" jobs: default: @@ -22,7 +27,7 @@ jobs: ${{ runner.os }}-bazel- - run: make - run: make build OPTIONS="--build_tag_filters nogo" TARGETS="//..." - - run: make run TARGETS="//tools/github" ARGS="-path=bazel-bin/ nogo" + - run: make run TARGETS="//tools/github" ARGS="-path=bazel-bin/ -path=bazel-out/ nogo" env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 3a6a592d1..c87ab22ef 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -1,12 +1,12 @@ +# This workflow generates the Go branch. Note that this does not test the Go +# branch, as this is rolled into the main continuous integration pipeline. This +# workflow simply generates and pushes the branch, as long as appropriate +# permissions are available. name: "Go" on: push: branches: - master - pull_request: - branches: - - master - - feature/** jobs: generate: @@ -19,20 +19,13 @@ jobs: else echo ::set-output name=has_token::false fi - - run: | - jq -nc '{"state": "pending", "context": "go tests"}' | \ - curl -sL -X POST -d @- \ - -H "Content-Type: application/json" \ - -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - "${{ github.event.pull_request.statuses_url }}" - if: github.event_name == 'pull_request' - uses: actions/checkout@v2 - if: github.event_name == 'push' && steps.setup.outputs.has_token == 'true' + if: steps.setup.outputs.has_token == 'true' with: fetch-depth: 0 token: '${{ secrets.GO_TOKEN }}' - uses: actions/checkout@v2 - if: github.event_name == 'pull_request' || steps.setup.outputs.has_token != 'true' + if: steps.setup.outputs.has_token != 'true' with: fetch-depth: 0 - uses: actions/setup-go@v2 @@ -50,32 +43,7 @@ jobs: key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }} restore-keys: | ${{ runner.os }}-bazel- - # Create gopath to merge the changes. The first execution will create - # symlinks to the cache, e.g. bazel-bin. Once the cache is setup, delete - # old gopath files that may exist from previous runs (and could contain - # files that are now deleted). Then run gopath again for good. - - run: | - make build TARGETS="//:gopath" - rm -rf bazel-bin/gopath - make build TARGETS="//:gopath" - - run: tools/go_branch.sh - - run: git checkout go && git clean -f - - run: go build ./... - - if: github.event_name == 'push' + - run: make go run: | git remote add upstream "https://github.com/${{ github.repository }}" git push upstream go:go - - if: ${{ success() && github.event_name == 'pull_request' }} - run: | - jq -nc '{"state": "success", "context": "go tests"}' | \ - curl -sL -X POST -d @- \ - -H "Content-Type: application/json" \ - -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - "${{ github.event.pull_request.statuses_url }}" - - if: ${{ failure() && github.event_name == 'pull_request' }} - run: | - jq -nc '{"state": "failure", "context": "go tests"}' | \ - curl -sL -X POST -d @- \ - -H "Content-Type: application/json" \ - -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - "${{ github.event.pull_request.statuses_url }}" diff --git a/.github/workflows/issue_reviver.yml b/.github/workflows/issue_reviver.yml index c53185620..f03b814c9 100644 --- a/.github/workflows/issue_reviver.yml +++ b/.github/workflows/issue_reviver.yml @@ -1,3 +1,5 @@ +# This workflow revives issues that are still referenced in the code, and may +# have been accidentally closed or marked stale. name: "Issue reviver" on: schedule: diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index c09f7eb36..a53fdb3e9 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -1,3 +1,4 @@ +# Labeler labels incoming pull requests. name: "Labeler" on: - pull_request diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 0b31fecf5..be10c5bc4 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -1,3 +1,5 @@ +# The stale workflow closes stale issues and pull requests, unless specific +# tags have been applied in order to keep them open. name: "Close stale issues" on: schedule: diff --git a/.gitignore b/.gitignore index a56f6ebcd..a2a3fd508 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,7 @@ # Generated bazel symlinks. -/bazel-*
\ No newline at end of file +/bazel-* +# Generated build event file. +/.build_events.json +# Generated repository. +/repo +/repo.key
\ No newline at end of file @@ -67,12 +67,15 @@ build_test( "//test/benchmarks/base:startup_test", "//test/benchmarks/base:size_test", "//test/benchmarks/base:sysbench_test", - "//test/benchmarks/database:database_test", + "//test/benchmarks/database:redis_test", "//test/benchmarks/fs:bazel_test", "//test/benchmarks/fs:fio_test", - "//test/benchmarks/media:media_test", - "//test/benchmarks/ml:ml_test", - "//test/benchmarks/network:network_test", + "//test/benchmarks/media:ffmpeg_test", + "//test/benchmarks/ml:tensorflow_test", + "//test/benchmarks/network:httpd_test", + "//test/benchmarks/network:nginx_test", + "//test/benchmarks/network:node_test", + "//test/benchmarks/network:ruby_test", ], ) @@ -14,19 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Helpful pretty-printer. -MAKEBANNER := \033[1;34mmake\033[0m -submake = echo -e '$(MAKEBANNER) $1' >&2; $(MAKE) $1 - -# Described below. -OPTIONS := -STARTUP_OPTIONS := -TARGETS := //runsc -ARGS := - default: runsc .PHONY: default +# Header for debugging (used by other macros). +header = echo --- $(1) >&2 + +# Make hacks. +EMPTY := +SPACE := $(EMPTY) $(EMPTY) + ## usage: make <target> ## or ## make <build|test|copy|run|sudo> STARTUP_OPTIONS="..." OPTIONS="..." TARGETS="..." ARGS="..." @@ -38,7 +35,6 @@ default: runsc ## requirements. ## ## There are common arguments that may be passed to targets. These are: -## STARTUP_OPTIONS - Bazel startup options. ## OPTIONS - Build or test options. ## TARGETS - The bazel targets. ## ARGS - Arguments for run or sudo. @@ -49,7 +45,7 @@ default: runsc ## make build OPTIONS="" TARGETS="//runsc"' ## help: ## Shows all targets and help from the Makefile (this message). - @grep --no-filename -E '^([a-z.A-Z_-]+:.*?|)##' $(MAKEFILE_LIST) | \ + @grep --no-filename -E '^([a-z.A-Z_%-]+:.*?|)##' $(MAKEFILE_LIST) | \ awk 'BEGIN {FS = "(:.*?|)## ?"}; { \ if (length($$1) > 0) { \ printf " \033[36m%-20s\033[0m %s\n", $$1, $$2; \ @@ -57,17 +53,34 @@ help: ## Shows all targets and help from the Makefile (this message). printf "%s\n", $$2; \ } \ }' + build: ## Builds the given $(TARGETS) with the given $(OPTIONS). E.g. make build TARGETS=runsc -test: ## Tests the given $(TARGETS) with the given $(OPTIONS). E.g. make test TARGETS=pkg/buffer:buffer_test -copy: ## Copies the given $(TARGETS) to the given $(DESTINATION). E.g. make copy TARGETS=runsc DESTINATION=/tmp -run: ## Runs the given $(TARGETS), built with $(OPTIONS), using $(ARGS). E.g. make run TARGETS=runsc ARGS=-version -sudo: ## Runs the given $(TARGETS) as per run, but using "sudo -E". E.g. make sudo TARGETS=test/root:root_test ARGS=-test.v -.PHONY: help build test copy run sudo + @$(call build,$(OPTIONS) $(TARGETS)) +.PHONY: build + +test: ## Tests the given $(TARGETS) with the given $(OPTIONS). E.g. make test TARGETS=pkg/buffer:buffer_test + @$(call build,$(OPTIONS) $(TARGETS)) +.PHONY: test + +copy: ## Copies the given $(TARGETS) to the given $(DESTINATION). E.g. make copy TARGETS=runsc DESTINATION=/tmp + @$(call copy,$(TARGETS),$(DESTINATION)) +.PHONY: copy + +run: ## Runs the given $(TARGETS), built with $(OPTIONS), using $(ARGS). E.g. make run TARGETS=runsc ARGS=-version + @$(call run,$(TARGETS),$(ARGS)) +.PHONY: run + +sudo: ## Runs the given $(TARGETS) as per run, but using "sudo -E". E.g. make sudo TARGETS=test/root:root_test ARGS=-test.v + @$(call sudo,$(TARGETS),$(ARGS)) +.PHONY: sudo + +# Load image helpers. +include tools/images.mk # Load all bazel wrappers. # # This file should define the basic "build", "test", "run" and "sudo" rules, in -# addition to the $(BRANCH_NAME) variable. +# addition to the $(BRANCH_NAME) and $(BUILD_ROOTS) variables. ifneq (,$(wildcard tools/google.mk)) include tools/google.mk else @@ -75,32 +88,74 @@ include tools/bazel.mk endif ## -## Docker image targets. -## -## Images used by the tests must also be built and available locally. -## The canonical test targets defined below will automatically load -## relevant images. These can be loaded or built manually via these -## targets. +## Development helpers and tooling. ## -## (*) Note that you may provide an ARCH parameter in order to build -## and load images from an alternate archiecture (using qemu). When -## bazel is run as a server, this has the effect of running an full -## cross-architecture chain, and can produce cross-compiled binaries. +## These targets faciliate local development by automatically +## installing and configuring a runtime. Several variables may +## be used here to tweak the installation: +## RUNTIME - The name of the installed runtime (default: branch). +## RUNTIME_DIR - Where the runtime will be installed (default: temporary directory with the $RUNTIME). +## RUNTIME_BIN - The runtime binary (default: $RUNTIME_DIR/runsc). +## RUNTIME_LOG_DIR - The logs directory (default: $RUNTIME_DIR/logs). +## RUNTIME_LOGS - The log pattern (default: $RUNTIME_LOG_DIR/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%). ## -define images -$(1)-%: ## Image tool: $(1) a given image (also may use 'all-images'). - @$(call submake,-C images $$@) -endef -rebuild-...: ## Rebuild the given image. Also may use 'rebuild-all-images'. -$(eval $(call images,rebuild)) -push-...: ## Push the given image. Also may use 'push-all-images'. -$(eval $(call images,push)) -pull-...: ## Pull the given image. Also may use 'pull-all-images'. -$(eval $(call images,pull)) -load-...: ## Load (pull or rebuild) the given image. Also may use 'load-all-images'. -$(eval $(call images,load)) -list-images: ## List all available images. - @$(call submake, -C images $$@) +ifeq (,$(BRANCH_NAME)) +RUNTIME := runsc +RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(RUNTIME) +else +RUNTIME := $(BRANCH_NAME) +RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(RUNTIME) +endif +RUNTIME_BIN := $(RUNTIME_DIR)/runsc +RUNTIME_LOG_DIR := $(RUNTIME_DIR)/logs +RUNTIME_LOGS := $(RUNTIME_LOG_DIR)/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND% + +$(RUNTIME_BIN): # See below. + @mkdir -p "$(RUNTIME_DIR)" + @$(call copy,//runsc,$(RUNTIME_BIN)) +.PHONY: $(RUNTIME_BIN) # Real file, but force rebuild. + +# Configure helpers for below. +configure_noreload = \ + $(call header,CONFIGURE $(1) → $(RUNTIME_BIN) $(2)); \ + sudo $(RUNTIME_BIN) install --experimental=true --runtime="$(1)" -- --debug-log "$(RUNTIME_LOGS)" $(2) && \ + sudo rm -rf "$(RUNTIME_LOG_DIR)" && mkdir -p "$(RUNTIME_LOG_DIR)" +reload_docker = \ + sudo systemctl reload docker && \ + if test -f /etc/docker/daemon.json; then \ + sudo chmod 0755 /etc/docker && \ + sudo chmod 0644 /etc/docker/daemon.json; \ + fi +configure = $(call configure_noreload,$(1),$(2)) && $(reload_docker) + +# Helpers for above. Requires $(RUNTIME_BIN) dependency. +install_runtime = $(call configure,$(RUNTIME),$(1) --TESTONLY-test-name-env=RUNSC_TEST_NAME) +test_runtime = $(call test,--test_arg=--runtime=$(RUNTIME) $(PARTITIONS) $(1)) + +refresh: $(RUNTIME_BIN) ## Updates the runtime binary. +.PHONY: refresh + +dev: $(RUNTIME_BIN) ## Installs a set of local runtimes. Requires sudo. + @$(call configure_noreload,$(RUNTIME),--net-raw) + @$(call configure_noreload,$(RUNTIME)-d,--net-raw --debug --strace --log-packets) + @$(call configure_noreload,$(RUNTIME)-p,--net-raw --profile) + @$(call configure_noreload,$(RUNTIME)-vfs2-d,--net-raw --debug --strace --log-packets --vfs2) + @$(call reload_docker) +.PHONY: dev + +nogo: ## Surfaces all nogo findings. + @$(call build,--build_tag_filters nogo //...) + @$(call run,//tools/github $(foreach dir,$(BUILD_ROOTS),-path=$(CURDIR)/$(dir)) -dry-run nogo) +.PHONY: nogo + +go: ## Builds the Go branch. + @$(call clean) + @$(call build,//:gopath) + @tools/go_branch.sh + +gazelle: ## Runs gazelle to update WORKSPACE. + @$(call run,//:gazelle update-repos -from_file=go.mod -prune) +.PHONY: gazelle ## ## Canonical build and test targets. @@ -109,24 +164,32 @@ list-images: ## List all available images. ## convenient entrypoints for testing changes. If you're adding a ## new subsystem or workflow, consider adding a new target here. ## +## Some targets support a PARTITION (1-indexed) and TOTAL_PARTITIONS +## environment variables for high-level test sharding. Unlike most +## other variables, these are sourced from the environment. +## +PARTITION ?= 1 +TOTAL_PARTITIONS ?= 1 +PARTITIONS := --test_arg=--partition=$(PARTITION) --test_arg=--total_partitions=$(TOTAL_PARTITIONS) + runsc: ## Builds the runsc binary. - @$(call submake,build OPTIONS="-c opt" TARGETS="//runsc") + @$(call build,-c opt //runsc) .PHONY: runsc debian: ## Builds the debian packages. - @$(call submake,build OPTIONS="-c opt" TARGETS="//debian:debian") + @$(call build,-c opt //debian:debian) .PHONY: debian smoke-tests: ## Runs a simple smoke test after build runsc. - @$(call submake,run DOCKER_PRIVILEGED="" ARGS="--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true") + @$(call run,//runsc,--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true) .PHONY: smoke-tests fuse-tests: - @$(call submake,test OPTIONS="--test_tag_filters fuse" TARGETS="test/fuse/...") + @$(call test,--test_tag_filters=fuse $(PARTITIONS) test/fuse/...) .PHONY: fuse-tests unit-tests: ## Local package unit tests in pkg/..., runsc/, tools/.., etc. - @$(call submake,test TARGETS="pkg/... runsc/... tools/...") + @$(call test,pkg/... runsc/... tools/...) .PHONY: unit-tests tests: ## Runs all unit tests and syscall tests. @@ -135,120 +198,99 @@ tests: unit-tests syscall-tests integration-tests: ## Run all standard integration tests. integration-tests: docker-tests overlay-tests hostnet-tests swgso-tests -integration-tests: do-tests kvm-tests containerd-test-1.3.4 +integration-tests: do-tests kvm-tests containerd-test-1.3.9 .PHONY: integration-tests network-tests: ## Run all networking integration tests. network-tests: iptables-tests packetdrill-tests packetimpact-tests .PHONY: network-tests -# Standard integration targets. -INTEGRATION_TARGETS := //test/image:image_test //test/e2e:integration_test - syscall-%-tests: - @$(call submake,test OPTIONS="--test_tag_filters runsc_$*" TARGETS="test/syscalls/...") + @$(call test,--test_tag_filters=runsc_$* $(PARTITIONS) test/syscalls/...) syscall-native-tests: - @$(call submake,test OPTIONS="--test_tag_filters native" TARGETS="test/syscalls/...") + @$(call test,--test_tag_filters=native $(PARTITIONS) test/syscalls/...) .PHONY: syscall-native-tests syscall-tests: ## Run all system call tests. - @$(call submake,test TARGETS="test/syscalls/...") + @$(call test,$(PARTITIONS) test/syscalls/...) -%-runtime-tests: load-runtimes_% -ifeq ($(PARTITION),) - @$(eval PARTITION := 1) -endif -ifeq ($(TOTAL_PARTITIONS),) - @$(eval TOTAL_PARTITIONS := 1) -endif - @$(call submake,install-runtime) - @$(call submake,test-runtime OPTIONS="--test_timeout=10800 --test_arg=--partition=$(PARTITION) --test_arg=--total_partitions=$(TOTAL_PARTITIONS)" TARGETS="//test/runtimes:$*") +%-runtime-tests: load-runtimes_% $(RUNTIME_BIN) + @$(call install_runtime,) # Ensure flags are cleared. + @$(call test_runtime,--test_timeout=10800 //test/runtimes:$*) -%-runtime-tests_vfs2: load-runtimes_% -ifeq ($(PARTITION),) - @$(eval PARTITION := 1) -endif -ifeq ($(TOTAL_PARTITIONS),) - @$(eval TOTAL_PARTITIONS := 1) -endif - @$(call submake,install-runtime RUNTIME="vfs2" ARGS="--vfs2") - @$(call submake,test-runtime RUNTIME="vfs2" OPTIONS="--test_timeout=10800 --test_arg=--partition=$(PARTITION) --test_arg=--total_partitions=$(TOTAL_PARTITIONS)" TARGETS="//test/runtimes:$*") +%-runtime-tests_vfs2: load-runtimes_% $(RUNTIME_BIN) + @$(call install_runtime,--vfs2) + @$(call test_runtime,--test_timeout=10800 //test/runtimes:$*) -do-tests: runsc - @$(call submake,run TARGETS="//runsc" ARGS="--rootless do true") - @$(call submake,run TARGETS="//runsc" ARGS="--rootless -network=none do true") - @$(call submake,sudo TARGETS="//runsc" ARGS="do true") +do-tests: + @$(call run,//runsc,--rootless do true) + @$(call run,//runsc,--rootless -network=none do true) + @$(call sudo,//runsc,do true) .PHONY: do-tests simple-tests: unit-tests # Compatibility target. .PHONY: simple-tests -docker-tests: load-basic-images - @$(call submake,install-runtime RUNTIME="vfs1") - @$(call submake,test-runtime RUNTIME="vfs1" TARGETS="$(INTEGRATION_TARGETS)") - @$(call submake,install-runtime RUNTIME="vfs2" ARGS="--vfs2") - @$(call submake,test-runtime RUNTIME="vfs2" TARGETS="$(INTEGRATION_TARGETS)") +# Standard integration targets. +INTEGRATION_TARGETS := //test/image:image_test //test/e2e:integration_test + +docker-tests: load-basic $(RUNTIME_BIN) + @$(call install_runtime,) # Clear flags. + @$(call test_runtime,$(INTEGRATION_TARGETS)) + @$(call install_runtime,--vfs2) + @$(call test_runtime,$(INTEGRATION_TARGETS)) .PHONY: docker-tests -overlay-tests: load-basic-images - @$(call submake,install-runtime RUNTIME="overlay" ARGS="--overlay") - @$(call submake,test-runtime RUNTIME="overlay" TARGETS="$(INTEGRATION_TARGETS)") +overlay-tests: load-basic $(RUNTIME_BIN) + @$(call install_runtime,--overlay) + @$(call test_runtime,$(INTEGRATION_TARGETS)) .PHONY: overlay-tests -swgso-tests: load-basic-images - @$(call submake,install-runtime RUNTIME="swgso" ARGS="--software-gso=true --gso=false") - @$(call submake,test-runtime RUNTIME="swgso" TARGETS="$(INTEGRATION_TARGETS)") +swgso-tests: load-basic $(RUNTIME_BIN) + @$(call install_runtime,--software-gso=true --gso=false) + @$(call test_runtime,$(INTEGRATION_TARGETS)) .PHONY: swgso-tests -hostnet-tests: load-basic-images - @$(call submake,install-runtime RUNTIME="hostnet" ARGS="--network=host") - @$(call submake,test-runtime RUNTIME="hostnet" OPTIONS="--test_arg=-checkpoint=false" TARGETS="$(INTEGRATION_TARGETS)") +hostnet-tests: load-basic $(RUNTIME_BIN) + @$(call install_runtime,--network=host) + @$(call test_runtime,--test_arg=-checkpoint=false --test_arg=-hostnet=true $(INTEGRATION_TARGETS)) .PHONY: hostnet-tests -kvm-tests: load-basic-images +kvm-tests: load-basic $(RUNTIME_BIN) @(lsmod | grep -E '^(kvm_intel|kvm_amd)') || sudo modprobe kvm - @if ! [[ -w /dev/kvm ]]; then sudo chmod a+rw /dev/kvm; fi - @$(call submake,test TARGETS="//pkg/sentry/platform/kvm:kvm_test") - @$(call submake,install-runtime RUNTIME="kvm" ARGS="--platform=kvm") - @$(call submake,test-runtime RUNTIME="kvm" TARGETS="$(INTEGRATION_TARGETS)") + @if ! test -w /dev/kvm; then sudo chmod a+rw /dev/kvm; fi + @$(call test,//pkg/sentry/platform/kvm:kvm_test) + @$(call install_runtime,--platform=kvm) + @$(call test_runtime,$(INTEGRATION_TARGETS)) .PHONY: kvm-tests -iptables-tests: load-iptables +iptables-tests: load-iptables $(RUNTIME_BIN) @sudo modprobe iptable_filter @sudo modprobe ip6table_filter - @$(call submake,test-runtime RUNTIME="runc" TARGETS="//test/iptables:iptables_test") - @$(call submake,install-runtime RUNTIME="iptables" ARGS="--net-raw") - @$(call submake,test-runtime RUNTIME="iptables" TARGETS="//test/iptables:iptables_test") + @$(call test,--test_arg=-runtime=runc $(PARTITIONS) //test/iptables:iptables_test) + @$(call install_runtime,--net-raw) + @$(call test_runtime,//test/iptables:iptables_test) .PHONY: iptables-tests -# Run the iptables tests with runsc only. Useful for developing to skip runc -# testing. -iptables-runsc-tests: load-iptables - @sudo modprobe iptable_filter - @sudo modprobe ip6table_filter - @$(call submake,install-runtime RUNTIME="iptables" ARGS="--net-raw") - @$(call submake,test-runtime RUNTIME="iptables" TARGETS="//test/iptables:iptables_test") -.PHONY: iptables-runsc-tests - -packetdrill-tests: load-packetdrill - @$(call submake,install-runtime RUNTIME="packetdrill") - @$(call submake,test-runtime RUNTIME="packetdrill" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetdrill, tests(//...))')") +packetdrill-tests: load-packetdrill $(RUNTIME_BIN) + @$(call install_runtime,) # Clear flags. + @$(call test_runtime,//test/packetdrill:all_tests) .PHONY: packetdrill-tests -packetimpact-tests: load-packetimpact +packetimpact-tests: load-packetimpact $(RUNTIME_BIN) @sudo modprobe iptable_filter @sudo modprobe ip6table_filter - @$(call submake,install-runtime RUNTIME="packetimpact") - @$(call submake,test-runtime OPTIONS="--jobs=HOST_CPUS*3 --local_test_jobs=HOST_CPUS*3" RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')") + @$(call install_runtime,) # Clear flags. + @$(call test_runtime,--jobs=HOST_CPUS*3 --local_test_jobs=HOST_CPUS*3 //test/packetimpact/tests:all_tests) .PHONY: packetimpact-tests # Specific containerd version tests. -containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd load-basic_ubuntu - @$(call submake,install-runtime RUNTIME="root") - @CONTAINERD_VERSION=$* $(MAKE) sudo TARGETS="tools/installers:containerd" - @$(MAKE) sudo TARGETS="tools/installers:shim" - @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="--runtime=root -test.v" +containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd load-basic_ubuntu $(RUNTIME_BIN) + @$(call install_runtime,) # Clear flags. + @$(call sudo,tools/installers:containerd,$*) + @$(call sudo,tools/installers:shim) + @$(call sudo,test/root:root_test,--runtime=$(RUNTIME) -test.v) # Note that we can't run containerd-test-1.1.8 tests here. # @@ -257,8 +299,8 @@ containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-b # actually drive the tests. The v1 API is tested exclusively through 1.2.13. containerd-tests: ## Runs all supported containerd version tests. containerd-tests: containerd-test-1.2.13 -containerd-tests: containerd-test-1.3.4 -containerd-tests: containerd-test-1.4.0-beta.0 +containerd-tests: containerd-test-1.3.9 +containerd-tests: containerd-test-1.4.3 ## ## Benchmarks. @@ -284,35 +326,35 @@ BENCHMARKS_UPLOAD := false BENCHMARKS_OFFICIAL := false BENCHMARKS_PLATFORMS := ptrace BENCHMARKS_TARGETS := //test/benchmarks/base:startup_test -BENCHMARKS_ARGS := -test.bench=. +BENCHMARKS_ARGS := -test.bench=. -pprof-cpu -pprof-heap -pprof-heap -pprof-block -init-benchmark-table: ## Initializes a BigQuery table with the benchmark schema -## (see //tools/bigquery/bigquery.go). If the table alread exists, this is a noop. - $(call submake, run TARGETS=//tools/parsers:parser ARGS="init --project=$(BENCHMARKS_PROJECT) \ - --dataset=$(BENCHMARKS_DATASET) --table=$(BENCHMARKS_TABLE)") +init-benchmark-table: ## Initializes a BigQuery table with the benchmark schema. + @$(call run,//tools/parsers:parser,init --project=$(BENCHMARKS_PROJECT) --dataset=$(BENCHMARKS_DATASET) --table=$(BENCHMARKS_TABLE)) .PHONY: init-benchmark-table -benchmark-platforms: load-benchmarks-images ## Runs benchmarks for runc and all given platforms in BENCHMARK_PLATFORMS. - $(call submake, run-benchmark RUNTIME="runc") - $(foreach PLATFORM,$(BENCHMARKS_PLATFORMS), \ - $(call submake,install-runtime RUNTIME="$(PLATFORM)" ARGS="--platform=$(PLATFORM) --vfs2") && \ - $(call submake,run-benchmark RUNTIME="$(PLATFORM)") && \ - $(call submake,install-runtime RUNTIME="$(PLATFORM)_vfs1" ARGS="--platform=$(PLATFORM)") && \ - $(call submake,run-benchmark RUNTIME="$(PLATFORM)_vfs1") && \ +# $(1) is the runtime name, $(2) are the arguments. +run_benchmark = \ + $(call header,BENCHMARK $(1) $(2)); \ + if test "$(1)" != "runc"; then $(call install_runtime,--profile $(2)); fi \ + @T=$$(mktemp --tmpdir logs.$(RUNTIME).XXXXXX); \ + $(call sudo,$(BENCHMARKS_TARGETS) --runtime=$(RUNTIME) $(BENCHMARKS_ARGS) | tee $$T); \ + rc=$$?; \ + if test $$rc -eq 0 && test "$(BENCHMARKS_UPLOAD)" == "true"; then \ + $(call run,tools/parsers:parser parse --debug --file=$$T --runtime=$(RUNTIME) --suite_name=$(BENCHMARKS_SUITE) --project=$(BENCHMARKS_PROJECT) --dataset=$(BENCHMARKS_DATASET) --table=$(BENCHMARKS_TABLE) --official=$(BENCHMARKS_OFFICIAL)); \ + fi; \ + rm -rf $$T; \ + exit $$rc + +benchmark-platforms: load-benchmarks ## Runs benchmarks for runc and all given platforms in BENCHMARK_PLATFORMS. + @$(foreach PLATFORM,$(BENCHMARKS_PLATFORMS), \ + $(call run_benchmark,$(RUNTIME)+vfs2,$(BENCHMARK_ARGS) --platform=$(PLATFORM) --vfs2) && \ + $(call run_benchmark,$(RUNTIME),$(BENCHMARK_ARGS) --platform=$(PLATFORM)) && \ ) \ - true + $(call run-benchmark,runc) .PHONY: benchmark-platforms -run-benchmark: ## Runs single benchmark and optionally sends data to BigQuery. - @set -xeuo pipefail; T=$$(mktemp --tmpdir logs.$(RUNTIME).XXXXXX); \ - $(call submake,sudo TARGETS="$(BENCHMARKS_TARGETS)" ARGS="--runtime=$(RUNTIME) $(BENCHMARKS_ARGS)" | tee $$T); \ - if [[ "$(BENCHMARKS_UPLOAD)" == "true" ]]; then \ - $(call submake,run TARGETS=tools/parsers:parser ARGS="parse --debug --file=$$T \ - --runtime=$(RUNTIME) --suite_name=$(BENCHMARKS_SUITE) \ - --project=$(BENCHMARKS_PROJECT) --dataset=$(BENCHMARKS_DATASET) \ - --table=$(BENCHMARKS_TABLE) --official=$(BENCHMARKS_OFFICIAL)"); \ - fi; \ - rm -rf $$T +run-benchmark: load-benchmarks ## Runs single benchmark and optionally sends data to BigQuery. + @$(call run_benchmark,$(RUNTIME),$(BENCHMARK_ARGS)) .PHONY: run-benchmark ## @@ -332,7 +374,7 @@ WEBSITE_PROJECT := gvisordev WEBSITE_REGION := us-central1 website-build: load-jekyll ## Build the site image locally. - @$(call submake,run TARGETS="//website:website" ARGS="$(WEBSITE_IMAGE)") + @$(call run,//website:website,$(WEBSITE_IMAGE)) .PHONY: website-build website-server: website-build ## Run a local server for development. @@ -358,17 +400,17 @@ website-deploy: website-push ## Deploy a new version of the website. ## RELEASE_NAME - The name of the release in the proper format (needed for tag). ## RELEASE_NOTES - The file containing release notes (needed for tag). ## -RELEASE_ROOT := $(CURDIR)/repo -RELEASE_KEY := repo.key -RELEASE_NIGHTLY := false -RELEASE_COMMIT := -RELEASE_NAME := -RELEASE_NOTES := - +RELEASE_ROOT := $(CURDIR)/repo +RELEASE_KEY := repo.key +RELEASE_NIGHTLY := false +RELEASE_COMMIT := +RELEASE_NAME := +RELEASE_NOTES := GPG_TEST_OPTIONS := $(shell if gpg --pinentry-mode loopback --version >/dev/null 2>&1; then echo --pinentry-mode loopback; fi) + $(RELEASE_KEY): @echo "WARNING: Generating a key for testing ($@); don't use this." - T=$$(mktemp --tmpdir keyring.XXXXXX); \ + @T=$$(mktemp --tmpdir keyring.XXXXXX); \ C=$$(mktemp --tmpdir config.XXXXXX); \ echo Key-Type: DSA >> $$C && \ echo Key-Length: 1024 >> $$C && \ @@ -382,11 +424,11 @@ $(RELEASE_KEY): release: $(RELEASE_KEY) ## Builds a release. @mkdir -p $(RELEASE_ROOT) - @T=$$(mktemp -d --tmpdir release.XXXXXX); \ - $(call submake,copy TARGETS="//runsc:runsc" DESTINATION=$$T) && \ - $(call submake,copy TARGETS="//shim/v1:gvisor-containerd-shim" DESTINATION=$$T) && \ - $(call submake,copy TARGETS="//shim/v2:containerd-shim-runsc-v1" DESTINATION=$$T) && \ - $(call submake,copy TARGETS="//debian:debian" DESTINATION=$$T) && \ + @export T=$$(mktemp -d --tmpdir release.XXXXXX); \ + $(call copy,//runsc:runsc,$$T) && \ + $(call copy,//shim/v1:gvisor-containerd-shim,$$T) && \ + $(call copy,//shim/v2:containerd-shim-runsc-v1,$$T) && \ + $(call copy,//debian:debian,$$T) && \ NIGHTLY=$(RELEASE_NIGHTLY) tools/make_release.sh $(RELEASE_KEY) $(RELEASE_ROOT) $$T/*; \ rc=$$?; rm -rf $$T; exit $$rc .PHONY: release @@ -394,75 +436,3 @@ release: $(RELEASE_KEY) ## Builds a release. tag: ## Creates and pushes a release tag. @tools/tag_release.sh "$(RELEASE_COMMIT)" "$(RELEASE_NAME)" "$(RELEASE_NOTES)" .PHONY: tag - -## -## Development helpers and tooling. -## -## These targets faciliate local development by automatically -## installing and configuring a runtime. Several variables may -## be used here to tweak the installation: -## RUNTIME - The name of the installed runtime (default: branch). -## RUNTIME_DIR - Where the runtime will be installed (default: temporary directory with the $RUNTIME). -## RUNTIME_BIN - The runtime binary (default: $RUNTIME_DIR/runsc). -## RUNTIME_LOG_DIR - The logs directory (default: $RUNTIME_DIR/logs). -## RUNTIME_LOGS - The log pattern (default: $RUNTIME_LOG_DIR/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%). -## -ifeq (,$(BRANCH_NAME)) -RUNTIME := runsc -RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(RUNTIME) -else -RUNTIME := $(BRANCH_NAME) -RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(RUNTIME) -endif -RUNTIME_BIN := $(RUNTIME_DIR)/runsc -RUNTIME_LOG_DIR := $(RUNTIME_DIR)/logs -RUNTIME_LOGS := $(RUNTIME_LOG_DIR)/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND% - -dev: ## Installs a set of local runtimes. Requires sudo. - @$(call submake,refresh ARGS="--net-raw") - @$(call submake,configure RUNTIME_NAME="$(RUNTIME)" ARGS="--net-raw") - @$(call submake,configure RUNTIME_NAME="$(RUNTIME)-d" ARGS="--net-raw --debug --strace --log-packets") - @$(call submake,configure RUNTIME_NAME="$(RUNTIME)-p" ARGS="--net-raw --profile") - @$(call submake,configure RUNTIME_NAME="$(RUNTIME)-vfs2-d" ARGS="--net-raw --debug --strace --log-packets --vfs2") - @sudo systemctl restart docker -.PHONY: dev - -refresh: ## Refreshes the runtime binary (for development only). Must have called 'dev' or 'install-runtime' first. - @mkdir -p "$(RUNTIME_DIR)" - @$(call submake,copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)") -.PHONY: refresh - -install-runtime: ## Installs the runtime for testing. Requires sudo. - @$(call submake,refresh ARGS="--net-raw --TESTONLY-test-name-env=RUNSC_TEST_NAME $(ARGS)") - @$(call submake,configure RUNTIME_NAME=runsc) - @$(call submake,configure RUNTIME_NAME="$(RUNTIME)") - @sudo systemctl restart docker - @if [[ -f /etc/docker/daemon.json ]]; then \ - sudo chmod 0755 /etc/docker && \ - sudo chmod 0644 /etc/docker/daemon.json; \ - fi -.PHONY: install-runtime - -install-debug-runtime: ## Installs the runtime for debugging. Requires sudo. - @$(call submake,install-runtime ARGS="--debug --strace --log-packets $(ARGS)") -.PHONY: install-debug-runtime - -configure: ## Configures a single runtime. Requires sudo. Typically called from dev or install-runtime. - @sudo sudo "$(RUNTIME_BIN)" install --experimental=true --runtime="$(RUNTIME_NAME)" -- --debug-log "$(RUNTIME_LOGS)" $(ARGS) - @echo -e "$(INFO) Installed runtime \"$(RUNTIME)\" @ $(RUNTIME_BIN)" - @echo -e "$(INFO) Logs are in: $(RUNTIME_LOG_DIR)" - @sudo rm -rf "$(RUNTIME_LOG_DIR)" && mkdir -p "$(RUNTIME_LOG_DIR)" -.PHONY: configure - -test-runtime: ## A convenient wrapper around test that provides the runtime argument. Target must still be provided. - @$(call submake,test OPTIONS="$(OPTIONS) --test_arg=--runtime=$(RUNTIME)") -.PHONY: test-runtime - -nogo: ## Surfaces all nogo findings. - @$(call submake,build OPTIONS="--build_tag_filters nogo" TARGETS="//...") - @$(call submake,run TARGETS="//tools/github" ARGS="$(foreach dir,$(BUILD_ROOTS),-path=$(CURDIR)/$(dir)) -dry-run nogo") -.PHONY: nogo - -gazelle: ## Runs gazelle to update WORKSPACE. - @$(call submake,run TARGETS="//:gazelle" ARGS="update-repos -from_file=go.mod -prune") -.PHONY: gazelle @@ -38,6 +38,12 @@ http_archive( http_archive( name = "bazel_gazelle", + patch_args = ["-p1"], + patches = [ + # False positive output complaining about Go logrus versions spam the + # logs. Strip this message in this case. Does not affect control flow. + "//tools:bazel_gazelle.patch", + ], sha256 = "b85f48fa105c4403326e9525ad2b2cc437babaa6e15a3fc0b1dbab0ab064bc7c", urls = [ "https://mirror.bazel.build/github.com/bazelbuild/bazel-gazelle/releases/download/v0.22.2/bazel-gazelle-v0.22.2.tar.gz", @@ -187,8 +193,8 @@ go_repository( name = "com_github_containerd_containerd", build_file_proto_mode = "disable", importpath = "github.com/containerd/containerd", - sum = "h1:3o0smo5SKY7H6AJCmJhsnCjR2/V2T8VmiHt7seN2/kI=", - version = "v1.3.4", + sum = "h1:K2U/F4jGAMBqeUssfgJRbFuomLcS2Fxo1vR3UM/Mbh8=", + version = "v1.3.9", ) go_repository( @@ -518,8 +524,8 @@ go_repository( name = "com_github_containerd_cgroups", build_file_proto_mode = "disable", importpath = "github.com/containerd/cgroups", - sum = "h1:5yg0k8gqOssNLsjjCtXIADoPbAtUtQZJfC8hQ4r2oFY=", - version = "v0.0.0-20181219155423-39b18af02c41", + sum = "h1:7grrpcfCtbZLsjtB0DgMuzs1umsJmpzaHMZ6cO6iAWw=", + version = "v0.0.0-20201119153540-4cbc285b3327", ) go_repository( diff --git a/g3doc/proposals/runtime_dedicate_os_thread.md b/g3doc/proposals/runtime_dedicate_os_thread.md new file mode 100644 index 000000000..dc70055b0 --- /dev/null +++ b/g3doc/proposals/runtime_dedicate_os_thread.md @@ -0,0 +1,188 @@ +# `runtime.DedicateOSThread` + +Status as of 2020-09-18: Deprioritized; initial studies in #2180 suggest that +this may be difficult to support in the Go runtime due to issues with GC. + +## Summary + +Allow goroutines to bind to kernel threads in a way that allows their scheduling +to be kernel-managed rather than runtime-managed. + +## Objectives + +* Reduce Go runtime overhead in the gVisor sentry (#2184). + +* Minimize intrusiveness of changes to the Go runtime. + +## Background + +In Go, execution contexts are referred to as goroutines, which the runtime calls +Gs. The Go runtime maintains a variably-sized pool of threads (called Ms by the +runtime) on which Gs are executed, as well as a pool of "virtual processors" +(called Ps by the runtime) of size equal to `runtime.GOMAXPROCS()`. Usually, +each M requires a P in order to execute Gs, limiting the number of concurrently +executing goroutines to `runtime.GOMAXPROCS()`. + +The `runtime.LockOSThread` function temporarily locks the invoking goroutine to +its current thread. It is primarily useful for interacting with OS or non-Go +library facilities that are per-thread. It does not reduce interactions with the +Go runtime scheduler: locked Ms relinquish their P when they become blocked, and +only continue execution after another M "chooses" their locked G to run and +donates their P to the locked M instead. + +## Problems + +### Context Switch Overhead + +Most goroutines in the gVisor sentry are task goroutines, which back application +threads. Task goroutines spend large amounts of time blocked on syscalls that +execute untrusted application code. When invoking said syscall (which varies by +gVisor platform), the task goroutine may interact with the Go runtime in one of +three ways: + +* It can invoke the syscall without informing the runtime. In this case, the + task goroutine will continue to hold its P during the syscall, limiting the + number of application threads that can run concurrently to + `runtime.GOMAXPROCS()`. This is problematic because the Go runtime scheduler + is known to scale poorly with `GOMAXPROCS`; see #1942 and + https://github.com/golang/go/issues/28808. It also means that preemption of + application threads must be driven by sentry or runtime code, which is + strictly slower than kernel-driven preemption (since the sentry must invoke + another syscall to preempt the application thread). + +* It can call `runtime.entersyscallblock` before invoking the syscall, and + `runtime.exitsyscall` after the syscall returns. In this case, the task + goroutine will release its P while the syscall is executing. This allows the + number of threads concurrently executing application code to exceed + `GOMAXPROCS`. However, this incurs additional latency on syscall entry (to + hand off the released P to another M, often requiring a `futex(FUTEX_WAKE)` + syscall) and on syscall exit (to acquire a new P). It also drastically + increases the number of threads that concurrently interact with the runtime + scheduler, which is also problematic for performance (both in terms of CPU + utilization and in terms of context switch latency); see #205. + +- It can call `runtime.entersyscall` before invoking the syscall, and + `runtime.exitsyscall` after the syscall returns. In this case, the task + goroutine "lazily releases" its P, allowing the runtime's "sysmon" thread to + steal it on behalf of another M after a 20us delay. This mitigates the + context switch latency problem when there are few task goroutines and the + interval between switches to application code (i.e. the interval between + application syscalls, page faults, or signal delivery) is short. (Cynically, + this means that it's most effective in microbenchmarks). However, the delay + before a P is stolen can also be problematic for performance when there are + both many task goroutines switching to application code (lazily releasing + their Ps) *and* many task goroutines switching to sentry code (contending + for Ps), which is likely in larger heterogeneous workloads. + +### Blocking Overhead + +Task goroutines block on behalf of application syscalls like `futex` and +`epoll_wait` by receiving from a Go channel. (Future work may convert task +goroutine blocking to use the `syncevent` package to avoid overhead associated +with channels and `select`, but this does not change how blocking interacts with +the Go runtime scheduler.) + +If `runtime.LockOSThread()` is not in effect when a task goroutine blocks, then +when the task goroutine is unblocked (by e.g. an application `FUTEX_WAKE`, +signal delivery, or a timeout) by sending to the blocked channel, +`runtime.ready` migrates the unblocked G to the unblocking P. In most cases, +this implies that every application thread block/unblock cycle results in a +migration of the thread between Ps, and therefore Ms, and therefore cores, +resulting in reduced application performance due to loss of CPU caches. +Furthermore, in most cases, the unblocking P cannot immediately switch to the +unblocked G (instead resuming execution of its current application thread after +completing the application's `futex(FUTEX_WAKE)`, `tgkill`, etc. syscall), often +requiring that another P steal the unblocked G before it can resume execution. + +If `runtime.LockOSThread()` is in effect when a task goroutine blocks, then the +G will remain locked to its M, avoiding the core migration described above; +however, wakeup latency is significantly increased since, as described in +"Background", the G still needs to be selected by the scheduler before it can +run, and the M that selects the G then needs to transfer its P to the locked M, +incurring an additional `FUTEX_WAKE` syscall and round of kernel scheduling. + +## Proposal + +We propose to add a function, tentatively called `DedicateOSThread`, to the Go +`runtime` package, documented as follows: + +```go +// DedicateOSThread wires the calling goroutine to its current operating system +// thread, and exempts it from counting against GOMAXPROCS. The calling +// goroutine will always execute in that thread, and no other goroutine will +// execute in it, until the calling goroutine has made as many calls to +// UndedicateOSThread as to DedicateOSThread. If the calling goroutine exits +// without unlocking the thread, the thread will be terminated. +// +// DedicateOSThread should only be used by long-lived goroutines that usually +// block due to blocking system calls, rather than interaction with other +// goroutines. +func DedicateOSThread() +``` + +Mechanically, `DedicateOSThread` implies `LockOSThread` (i.e. it locks the +invoking G to a M), but additionally locks the invoking M to a P. Ps locked by +`DedicateOSThread` are not counted against `GOMAXPROCS`; that is, the actual +number of Ps in the system (`len(runtime.allp)`) is `GOMAXPROCS` plus the number +of bound Ps (plus some slack to avoid frequent changes to `runtime.allp`). +Corollaries: + +* If `runtime.ready` observes that a readied G is locked to a M locked to a P, + it immediately wakes the locked M without migrating the G to the readying P + or waiting for a future call to `runtime.schedule` to select the readied G + in `runtime.findrunnable`. + +* `runtime.stoplockedm` and `runtime.reentersyscall` skip the release of + locked Ps; the latter also skips sysmon wakeup. `runtime.stoplockedm` and + `runtime.exitsyscall` skip re-acquisition of Ps if one is locked. + +* sysmon does not attempt to preempt Gs that are locked to Ps, avoiding + fruitless overhead from `tgkill` syscalls and signal delivery. + +* `runtime.findrunnable`'s work stealing skips locked Ps (suggesting that + unlocked Ps be tracked in a separate array). `runtime.findrunnable` on + locked Ps skip the global run queue, work stealing, and possibly netpoll. + +* New goroutines created by goroutines with locked Ps are enqueued on the + global run queue rather than the invoking P's local run queue. + +While gVisor's use case does not strictly require that the association is +reversible (with `runtime.UndedicateOSThread`), such a feature is required to +allow reuse of locked Ms, which is likely to be critical for performance. + +## Alternatives Considered + +* Make the runtime scale well with `GOMAXPROCS`. While we are also + concurrently investigating this problem, this would not address the issues + of increased preemption cost or blocking overhead. + +* Make the runtime scale well with number of Ms. It is unclear if this is + actually feasible, and would not address blocking overhead. + +* Make P-locking part of `LockOSThread`'s behavior. This would likely + introduce performance regressions in existing uses of `LockOSThread` that do + not fit this usage pattern. In particular, since `DedicateOSThread` + transitions the invoker's P from "counted against `GOMAXPROCS`" to "not + counted against `GOMAXPROCS`", it may need to wake another M to run a new P + (that is counted against `GOMAXPROCS`), and the converse applies to + `UndedicateOSThread`. + +* Rewrite the gVisor sentry in a language that does not force userspace + scheduling. This is a last resort due to the amount of code involved. + +## Related Issues + +The proposed functionality is directly analogous to `spawn_blocking` in Rust +async runtimes +[`async_std`](https://docs.rs/async-std/1.8.0/async_std/task/fn.spawn_blocking.html) +and [`tokio`](https://docs.rs/tokio/0.3.5/tokio/task/fn.spawn_blocking.html). + +Outside of gVisor: + +* https://github.com/golang/go/issues/21827#issuecomment-595152452 describes a + use case for this feature in go-delve, where the goroutine that would use + this feature spends much of its time blocked in `ptrace` syscalls. + +* This feature may improve performance in the use case described in + https://github.com/golang/go/issues/18237, given the prominence of + syscall.Syscall in the profile given in that bug report. @@ -10,8 +10,8 @@ require ( github.com/Microsoft/hcsshim v0.8.6 // indirect github.com/cenkalti/backoff v1.1.1-0.20190506075156-2146c9339422 // indirect github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3 // indirect - github.com/containerd/cgroups v0.0.0-20181219155423-39b18af02c41 // indirect - github.com/containerd/containerd v1.3.4 // indirect + github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327 + github.com/containerd/containerd v1.3.9 // indirect github.com/containerd/continuity v0.0.0-20200928162600-f2cc35102c2a // indirect github.com/containerd/fifo v0.0.0-20191213151349-ff969a566b00 // indirect github.com/containerd/go-runc v0.0.0-20200220073739-7016d3ce2328 // indirect @@ -51,12 +51,12 @@ github.com/containerd/cgroups v0.0.0-20181219155423-39b18af02c41 h1:5yg0k8gqOssN github.com/containerd/cgroups v0.0.0-20181219155423-39b18af02c41/go.mod h1:X9rLEHIqSf/wfK8NsPqxJmeZgW4pcfzdXITDrUSJ6uI= github.com/containerd/cgroups v0.0.0-20200531161412-0dbf7f05ba59 h1:qWj4qVYZ95vLWwqyNJCQg7rDsG5wPdze0UaPolH7DUk= github.com/containerd/cgroups v0.0.0-20200531161412-0dbf7f05ba59/go.mod h1:pA0z1pT8KYB3TCXK/ocprsh7MAkoW8bZVzPdih9snmM= +github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327 h1:7grrpcfCtbZLsjtB0DgMuzs1umsJmpzaHMZ6cO6iAWw= +github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327/go.mod h1:ZJeTFisyysqgcCdecO57Dj79RfL0LNeGiFUqLYQRYLE= github.com/containerd/console v0.0.0-20180822173158-c12b1e7919c1/go.mod h1:Tj/on1eG8kiEhd0+fhSDzsPAFESxzBBvdyEgyryXffw= github.com/containerd/console v0.0.0-20191206165004-02ecf6a7291e h1:GdiIYd8ZDOrT++e1NjhSD4rGt9zaJukHm4rt5F4mRQc= github.com/containerd/console v0.0.0-20191206165004-02ecf6a7291e/go.mod h1:8Pf4gM6VEbTNRIT26AyyU7hxdQU3MvAvxVI0sc00XBE= -github.com/containerd/containerd v1.3.2/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA= -github.com/containerd/containerd v1.3.4 h1:3o0smo5SKY7H6AJCmJhsnCjR2/V2T8VmiHt7seN2/kI= -github.com/containerd/containerd v1.3.4/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA= +github.com/containerd/containerd v1.3.9/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA= github.com/containerd/continuity v0.0.0-20190426062206-aaeac12a7ffc/go.mod h1:GL3xCUCBDV3CZiTSEKksMWbLE66hEyuu9qyDOOqM47Y= github.com/containerd/continuity v0.0.0-20200928162600-f2cc35102c2a h1:jEIoR0aA5GogXZ8pP3DUzE+zrhaF6/1rYZy+7KkYEWM= github.com/containerd/continuity v0.0.0-20200928162600-f2cc35102c2a/go.mod h1:W0qIOTD7mp2He++YVq+kgfXezRYqzP1uDuMVH1bITDY= diff --git a/images/BUILD b/images/BUILD index a50f388e9..34b950644 100644 --- a/images/BUILD +++ b/images/BUILD @@ -1,11 +1 @@ package(licenses = ["notice"]) - -# The images filegroup is definitely not a hermetic target, and requires Make -# to do anything meaningful with. However, this will be slurped up and used by -# the tools/installer/images.sh installer, which will ensure that all required -# images are available locally when running vm_tests. -filegroup( - name = "images", - srcs = glob(["**"]), - visibility = ["//tools/installers:__pkg__"], -) diff --git a/images/Makefile b/images/Makefile deleted file mode 100644 index 12927c509..000000000 --- a/images/Makefile +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/make -f - -# Copyright 2018 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ARCH is the architecture used for the build. This may be overriden at the -# command line in order to perform a cross-build (in a limited capacity). -ARCH := $(shell uname -m) - -# Note that the image prefixes used here must match the image mangling in -# runsc/testutil.MangleImage. Names are mangled in this way to ensure that all -# tests are using locally-defined images (that are consistent and idempotent). -REMOTE_IMAGE_PREFIX ?= gcr.io/gvisor-presubmit -LOCAL_IMAGE_PREFIX ?= gvisor.dev/images -ALL_IMAGES := $(subst /,_,$(subst ./,,$(shell find . -name Dockerfile -o -name Dockerfile.$(ARCH) | xargs -n 1 dirname | uniq))) -ifneq ($(ARCH),$(shell uname -m)) -DOCKER_PLATFORM_ARGS := --platform=$(ARCH) -else -DOCKER_PLATFORM_ARGS := -endif - -list-all-images: - @for image in $(ALL_IMAGES); do echo $${image}; done -.PHONY: list-build-images - -# Handy wrapper to allow load-all-images, push-all-images, etc. -%-all-images: - @$(MAKE) $(patsubst %,$*-%,$(ALL_IMAGES)) -load-all-images: - @$(MAKE) $(patsubst %,load-%,$(ALL_IMAGES)) - -# Handy wrapper to load specified "groups", e.g. load-basic-images, etc. -load-%-images: - @$(MAKE) $(patsubst %,load-%,$(subst /,_,$(subst ./,,$(shell find ./$* -name Dockerfile -exec dirname {} \;)))) - -# tag is a function that returns the tag name, given an image. -# -# The tag constructed is used to memoize the image generated (see README.md). -# This scheme is used to enable aggressive caching in a central repository, but -# ensuring that images will always be sourced using the local files if there -# are changes. -path = $(subst _,/,$(1)) -dockerfile = $$(if [ -f "$(call path,$(1))/Dockerfile.$(ARCH)" ]; then echo Dockerfile.$(ARCH); else echo Dockerfile; fi) -tag = $(shell find $(call path,$(1)) -type f -print | sort | xargs -n 1 sha256sum | sha256sum - | cut -c 1-16) -remote_image = $(REMOTE_IMAGE_PREFIX)/$(subst _,/,$(1))_$(ARCH):$(call tag,$(1)) -local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1)) - -# rebuild builds the image locally. Only the "remote" tag will be applied. Note -# we need to explicitly repull the base layer in order to ensure that the -# architecture is correct. Note that we use the term "rebuild" here to avoid -# conflicting with the bazel "build" terminology, which is used elsewhere. -rebuild-%: FROM=$(shell grep FROM "$(call path,$*)/$(call dockerfile,$*)" | cut -d' ' -f2) -rebuild-%: register-cross - @if ! [ -f "$(call path,$*)/$(call dockerfile,$*)" ]; then \ - (echo "ERROR: Dockerfile for $* not found (is it available for $(ARCH)?)." >&2 && exit 1); \ - fi - $(foreach IMAGE,$(FROM),docker pull $(DOCKER_PLATFORM_ARGS) $(IMAGE) &&) \ - T=$$(mktemp -d) && cp -a $(call path,$*)/* $$T && \ - docker build $(DOCKER_PLATFORM_ARGS) \ - -f "$$T/$(call dockerfile,$*)" \ - -t "$(call remote_image,$*)" \ - $$T && \ - rm -rf $$T - -# pull will check the "remote" image and pull if necessary. If the remote image -# must be pulled, then it will tag with the latest local target. Note that pull -# may fail if the remote image is not available. -pull-%: - docker pull $(DOCKER_PLATFORM_ARGS) $(call remote_image,$*) - -# load will either pull the "remote" or build it locally. This is the preferred -# entrypoint, as it should never fail. The local tag should always be set after -# this returns (either by the pull or the build). -load-%: - $(MAKE) pull-$* || $(MAKE) rebuild-$* - docker tag $(call remote_image,$*) $(call local_image,$*) - -# push pushes the remote image, after either pulling (to validate that the tag -# already exists) or building manually. -push-%: load-% - docker push $(call remote_image,$*) - -# register-cross registers the necessary qemu binaries for cross-compilation. -# This may be used by any target that may execute containers that are not the -# native format. -register-cross: -ifneq ($(ARCH),$(shell uname -m)) -ifeq (,$(wildcard /proc/sys/fs/binfmt_misc/qemu-*)) - docker run --rm --privileged multiarch/qemu-user-static --reset --persistent yes -else - @true # Already registered. -endif -else - @true # No cross required. -endif -.PHONY: register-cross diff --git a/images/agent/Dockerfile b/images/agent/Dockerfile new file mode 100644 index 000000000..1d8979390 --- /dev/null +++ b/images/agent/Dockerfile @@ -0,0 +1,12 @@ +FROM golang:1.15 as build-agent +RUN git clone --depth=1 --branch=v3.25.0 https://github.com/buildkite/agent +RUN cd agent && go build -i -o /buildkite-agent . + +FROM golang:1.15 as build-agent-metrics +RUN git clone --depth=1 --branch=v5.2.0 https://github.com/buildkite/buildkite-agent-metrics +RUN cd buildkite-agent-metrics && go build -i -o /buildkite-agent-metrics . + +FROM gcr.io/distroless/base-debian10 +COPY --from=build-agent /buildkite-agent / +COPY --from=build-agent-metrics /buildkite-agent-metrics / +CMD ["/buildkite-agent"] diff --git a/images/agent/README.md b/images/agent/README.md new file mode 100644 index 000000000..acb57bd2f --- /dev/null +++ b/images/agent/README.md @@ -0,0 +1,7 @@ +# Build Agent + +This is the image used by the build agent. It is built and bundled via a +separate packaging mechanism in order to provide local caching and to ensure +that there is better build provenance. Note that continuous integration system +will generally deploy new agents from the primary branch, and will only deploy +as instances are recycled. Updates to this image should be made carefully. diff --git a/images/basic/ping4test/Dockerfile b/images/basic/ping4test/Dockerfile new file mode 100644 index 000000000..1536be376 --- /dev/null +++ b/images/basic/ping4test/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:bionic + +WORKDIR /root +COPY ping4.sh . +RUN chmod +x ping4.sh + +RUN apt-get update && apt-get install -y iputils-ping diff --git a/tools/vm/zone.sh b/images/basic/ping4test/ping4.sh index 79569fb19..2a343712a 100755..100644 --- a/tools/vm/zone.sh +++ b/images/basic/ping4test/ping4.sh @@ -14,4 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -exec gcloud config get-value compute/zone +set -euo pipefail + +# The docker API doesn't provide for starting a container, running a command, +# and getting the exit status of the command in one go. The most straightforward +# way to do this is to verify the output of the command, so we output nothing on +# success and an error message on failure. +if ! out=$(ping -c 10 127.0.0.1); then + echo "$out" +fi diff --git a/images/basic/ping6test/Dockerfile b/images/basic/ping6test/Dockerfile new file mode 100644 index 000000000..cb740bd60 --- /dev/null +++ b/images/basic/ping6test/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:bionic + +WORKDIR /root +COPY ping6.sh . +RUN chmod +x ping6.sh + +RUN apt-get update && apt-get install -y iputils-ping iproute2 diff --git a/images/basic/ping6test/ping6.sh b/images/basic/ping6test/ping6.sh new file mode 100644 index 000000000..4268951d0 --- /dev/null +++ b/images/basic/ping6test/ping6.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Copyright 2020 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Enable ipv6 on loopback if it's not already enabled. Runsc doesn't enable ipv6 +# loopback unless an ipv6 address was assigned to the container, which docker +# does not do by default. +if ! [[ $(ip -6 addr show dev lo) ]]; then + ip addr add ::1 dev lo +fi + +# The docker API doesn't provide for starting a container, running a command, +# and getting the exit status of the command in one go. The most straightforward +# way to do this is to verify the output of the command, so we output nothing on +# success and an error message on failure. +if ! out=$(/bin/ping6 -c 10 ::1); then + echo "$out" +fi diff --git a/images/benchmarks/absl/Dockerfile b/images/benchmarks/absl/Dockerfile.x86_64 index b0dd97695..810c9ef5e 100644 --- a/images/benchmarks/absl/Dockerfile +++ b/images/benchmarks/absl/Dockerfile.x86_64 @@ -12,6 +12,7 @@ RUN set -x \ unzip \ python3 \ && rm -rf /var/lib/apt/lists/* + RUN wget https://github.com/bazelbuild/bazel/releases/download/0.27.0/bazel-0.27.0-installer-linux-x86_64.sh RUN chmod +x bazel-0.27.0-installer-linux-x86_64.sh RUN ./bazel-0.27.0-installer-linux-x86_64.sh diff --git a/images/benchmarks/hey/Dockerfile b/images/benchmarks/hey/Dockerfile index f586978b6..4b6a0f849 100644 --- a/images/benchmarks/hey/Dockerfile +++ b/images/benchmarks/hey/Dockerfile @@ -1,12 +1,13 @@ -FROM ubuntu:18.04 +FROM golang:1.15 as build +RUN go get github.com/rakyll/hey +WORKDIR /go/src/github.com/rakyll/hey +RUN go mod download +RUN CGO_ENABLED=0 go build -o /hey hey.go +FROM ubuntu:18.04 RUN set -x \ && apt-get update \ && apt-get install -y \ wget \ && rm -rf /var/lib/apt/lists/* - -RUN wget https://storage.googleapis.com/hey-release/hey_linux_amd64 \ - && chmod 777 hey_linux_amd64 \ - && cp hey_linux_amd64 /bin/hey \ - && rm hey_linux_amd64 +COPY --from=build /hey /bin/hey diff --git a/images/benchmarks/runsc/Dockerfile b/images/benchmarks/runsc/Dockerfile.x86_64 index 6c3aafa57..28ae64816 100644 --- a/images/benchmarks/runsc/Dockerfile +++ b/images/benchmarks/runsc/Dockerfile.x86_64 @@ -14,6 +14,7 @@ RUN set -x \ python3 \ python3-pip \ && rm -rf /var/lib/apt/lists/* + RUN wget https://github.com/bazelbuild/bazel/releases/download/3.4.1/bazel-3.4.1-installer-linux-x86_64.sh RUN chmod +x bazel-3.4.1-installer-linux-x86_64.sh RUN ./bazel-3.4.1-installer-linux-x86_64.sh diff --git a/images/default/Dockerfile b/images/default/Dockerfile index d058b83cb..224469267 100644 --- a/images/default/Dockerfile +++ b/images/default/Dockerfile @@ -1,16 +1,20 @@ FROM fedora:31 + # Install bazel. RUN dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel RUN dnf install -y git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static libstdc++-static patch diffutils RUN pip install --no-cache-dir pycparser RUN dnf install -y bazel3 -# Install gcloud. + +# Install gcloud. Note that while this is "x86_64", it doesn't actually matter. RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-289.0.0-linux-x86_64.tar.gz | \ - tar zxvf - google-cloud-sdk && \ + tar zxf - google-cloud-sdk && \ google-cloud-sdk/install.sh && \ ln -s /google-cloud-sdk/bin/gcloud /usr/bin/gcloud + # Install Docker client for the website build. RUN dnf config-manager --add-repo https://download.docker.com/linux/fedora/docker-ce.repo RUN dnf install -y docker-ce-cli + WORKDIR /workspace ENTRYPOINT ["/usr/bin/bazel"] diff --git a/images/runtimes/go1.12/Dockerfile b/images/runtimes/go1.12/Dockerfile.x86_64 index cb2944062..cb2944062 100644 --- a/images/runtimes/go1.12/Dockerfile +++ b/images/runtimes/go1.12/Dockerfile.x86_64 @@ -56,124 +56,8 @@ global: - "should not use ALL_CAPS in Go names" - "should not use underscores in Go names" exclude: - # A variety of staticcheck and stylecheck - # rules apply here. These should be fixed - # and removed from here, and the global - # rules should be used sparingly. - - pkg/abi/linux/fuse.go:22 - - pkg/abi/linux/fuse.go:25 - - pkg/abi/linux/socket.go:113 - - pkg/abi/linux/tty.go:73 - - pkg/cpuid/cpuid_x86.go:675 - - pkg/gohacks/gohacks_unsafe.go:33 - - pkg/log/json.go:30 - - pkg/log/log.go:359 - - pkg/metric/metric_test.go:20 - - pkg/p9/p9test/client_test.go:687 - - pkg/p9/transport_test.go:196 - - pkg/pool/pool.go:15 - - pkg/refs/refcounter.go:510 - - pkg/refs/refcounter_test.go:169 - - pkg/refs_vfs2/refs.go:16 - - pkg/safemem/block_unsafe.go:89 - - pkg/seccomp/seccomp.go:82 - - pkg/segment/test/set_functions.go:15 - - pkg/sentry/arch/signal.go:166 - - pkg/sentry/arch/signal.go:171 - - pkg/sentry/control/pprof.go:196 - - pkg/sentry/devices/memdev/full.go:58 - - pkg/sentry/devices/memdev/null.go:59 - - pkg/sentry/devices/memdev/random.go:68 - - pkg/sentry/devices/memdev/zero.go:86 - - pkg/sentry/fdimport/fdimport.go:15 - - pkg/sentry/fs/attr.go:257 - - pkg/sentry/fsbridge/fs.go:116 - - pkg/sentry/fsbridge/vfs.go:124 - - pkg/sentry/fsbridge/vfs.go:70 - - pkg/sentry/fs/copy_up.go:365 - - pkg/sentry/fs/copy_up_test.go:65 - - pkg/sentry/fs/dev/net_tun.go:161 - - pkg/sentry/fs/dev/net_tun.go:63 - - pkg/sentry/fs/dev/null.go:97 - - pkg/sentry/fs/dirent_cache.go:64 - - pkg/sentry/fs/fdpipe/pipe_opener_test.go:366 - - pkg/sentry/fs/file_overlay.go:327 - - pkg/sentry/fs/file_overlay.go:524 - - pkg/sentry/fs/filetest/filetest.go:55 - - pkg/sentry/fs/filetest/filetest.go:60 - - pkg/sentry/fs/fs.go:77 - - pkg/sentry/fs/fsutil/file.go:290 - - pkg/sentry/fs/fsutil/file.go:346 - - pkg/sentry/fs/fsutil/host_file_mapper.go:105 - - pkg/sentry/fs/fsutil/inode_cached.go:676 - - pkg/sentry/fs/fsutil/inode_cached.go:772 - - pkg/sentry/fs/gofer/attr.go:120 - - pkg/sentry/fs/gofer/fifo.go:33 - - pkg/sentry/fs/gofer/inode.go:410 - - pkg/sentry/fsimpl/ext/disklayout/superblock_64.go:97 - - pkg/sentry/fsimpl/ext/disklayout/superblock_old.go:92 - - pkg/sentry/fsimpl/ext/disklayout/block_group_32.go:44 - - pkg/sentry/fsimpl/ext/disklayout/inode_new.go:91 - - pkg/sentry/fsimpl/ext/disklayout/inode_old.go:93 - - pkg/sentry/fsimpl/ext/disklayout/superblock_32.go:66 - - pkg/sentry/fsimpl/ext/disklayout/block_group_64.go:53 - - pkg/sentry/fsimpl/fuse/request_response.go:71 - - pkg/sentry/fsimpl/signalfd/signalfd.go:15 - - pkg/sentry/memmap/memmap.go:103 - - pkg/sentry/memmap/memmap.go:163 - - pkg/sentry/mm/aio_context.go:208 - - pkg/sentry/mm/pma.go:683 - - pkg/sentry/usage/cpu.go:42 - - pkg/shim/runsc/runsc.go:16 - - pkg/shim/runsc/utils.go:16 - - pkg/shim/v1/proc/deleted_state.go:16 - - pkg/shim/v1/proc/exec.go:16 - - pkg/shim/v1/proc/exec_state.go:16 - - pkg/shim/v1/proc/init.go:16 - - pkg/shim/v1/proc/init_state.go:16 - - pkg/shim/v1/proc/io.go:16 - - pkg/shim/v1/proc/process.go:16 - - pkg/shim/v1/proc/types.go:16 - - pkg/shim/v1/proc/utils.go:16 - - pkg/shim/v1/shim/api.go:16 - - pkg/shim/v1/shim/platform.go:16 - - pkg/shim/v1/shim/service.go:16 - - pkg/shim/v1/utils/annotations.go:15 - - pkg/shim/v1/utils/utils.go:15 - - pkg/shim/v1/utils/volumes.go:15 - - pkg/shim/v2/api.go:16 - - pkg/shim/v2/epoll.go:18 - - pkg/shim/v2/options/options.go:15 - - pkg/shim/v2/options/options.go:24 - - pkg/shim/v2/options/options.go:26 - - pkg/shim/v2/runtimeoptions/runtimeoptions.go:16 - - pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go # Generated: exempt all. - - pkg/shim/v2/runtimeoptions/runtimeoptions_test.go:22 - - pkg/shim/v2/service.go:15 - - pkg/shim/v2/service_linux.go:18 - - pkg/state/tests/integer_test.go:23 - - pkg/state/tests/integer_test.go:28 - - pkg/sync/rwmutex_test.go:105 - - pkg/syserr/host_linux.go:35 - - pkg/usermem/addr.go:34 - - pkg/usermem/usermem.go:171 - - pkg/usermem/usermem.go:170 - - runsc/boot/compat.go:56 - - test/cmd/test_app/fds.go:171 - - test/iptables/filter_output.go:251 - - test/packetimpact/testbench/connections.go:77 - - tools/bigquery/bigquery.go:106 - - tools/checkescape/test1/test1.go:108 - - tools/checkescape/test1/test1.go:122 - - tools/checkescape/test1/test1.go:137 - - tools/checkescape/test1/test1.go:151 - - tools/checkescape/test1/test1.go:170 - - tools/checkescape/test1/test1.go:39 - - tools/checkescape/test1/test1.go:45 - - tools/checkescape/test1/test1.go:50 - - tools/checkescape/test1/test1.go:64 - - tools/checkescape/test1/test1.go:80 - - tools/checkescape/test1/test1.go:94 + # Generated: exempt all. + - pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go analyzers: asmdecl: external: # Enabled. @@ -215,6 +99,8 @@ analyzers: printf: external: # Enabled. shift: + generated: # Disabled for generated code; these shifts are well-defined. + exclude: [".*"] external: # Enabled. stringintconv: external: @@ -251,3 +137,22 @@ analyzers: external: # Enabled. checkescape: external: # Enabled. + SA4016: + internal: + exclude: + - pkg/gohacks/gohacks_unsafe.go # x ^ 0 always equals x. + SA2001: + internal: + exclude: + - pkg/sentry/fs/fs.go # Intentional. + - pkg/sentry/fs/gofer/inode.go # Intentional. + - pkg/refs/refcounter_test.go # Intentional. + ST1021: + internal: + suppress: + - "comment on exported type Translation" # Intentional. + - "comment on exported type PinnedRange" # Intentional. + SA5011: + internal: + exclude: + - pkg/sentry/fs/fdpipe/pipe_opener_test.go # False positive. diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index cc3571fad..d1ca56370 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -25,6 +25,8 @@ const ( F_SETLKW = 7 F_SETOWN = 8 F_GETOWN = 9 + F_SETSIG = 10 + F_GETSIG = 11 F_SETOWN_EX = 15 F_GETOWN_EX = 16 F_DUPFD_CLOEXEC = 1024 + 6 diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go index d91c97a64..1070b457c 100644 --- a/pkg/abi/linux/fuse.go +++ b/pkg/abi/linux/fuse.go @@ -19,16 +19,22 @@ import ( "gvisor.dev/gvisor/pkg/marshal/primitive" ) +// FUSEOpcode is a FUSE operation code. +// // +marshal type FUSEOpcode uint32 +// FUSEOpID is a FUSE operation ID. +// // +marshal type FUSEOpID uint64 // FUSE_ROOT_ID is the id of root inode. const FUSE_ROOT_ID = 1 -// Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h. +// Opcodes for FUSE operations. +// +// Analogous to the opcodes in include/linux/fuse.h. const ( FUSE_LOOKUP FUSEOpcode = 1 FUSE_FORGET = 2 /* no reply */ diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index 1b2f76c0b..2424884c1 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -32,6 +32,23 @@ const ( SEM_STAT_ANY = 20 ) +// Information about system-wide sempahore limits and parameters. +// +// Source: include/uapi/linux/sem.h +const ( + SEMMNI = 32000 + SEMMSL = 32000 + SEMMNS = SEMMNI * SEMMSL + SEMOPM = 500 + SEMVMX = 32767 + SEMAEM = SEMVMX + + SEMUME = SEMOPM + SEMMNU = SEMMNS + SEMMAP = SEMMNS + SEMUSZ = 20 +) + const SEM_UNDO = 0x1000 // Sembuf is equivalent to struct sembuf. @@ -42,3 +59,21 @@ type Sembuf struct { SemOp int16 SemFlg int16 } + +// SemInfo is equivalent to struct seminfo. +// +// Source: include/uapi/linux/sem.h +// +// +marshal +type SemInfo struct { + SemMap uint32 + SemMni uint32 + SemMns uint32 + SemMnu uint32 + SemMsl uint32 + SemOpm uint32 + SemUme uint32 + SemUsz uint32 + SemVmx uint32 + SemAem uint32 +} diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index d156d41e4..556892dc3 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -111,12 +111,12 @@ type SockType int // Socket types, from linux/net.h. const ( SOCK_STREAM SockType = 1 - SOCK_DGRAM = 2 - SOCK_RAW = 3 - SOCK_RDM = 4 - SOCK_SEQPACKET = 5 - SOCK_DCCP = 6 - SOCK_PACKET = 10 + SOCK_DGRAM SockType = 2 + SOCK_RAW SockType = 3 + SOCK_RDM SockType = 4 + SOCK_SEQPACKET SockType = 5 + SOCK_DCCP SockType = 6 + SOCK_PACKET SockType = 10 ) // SOCK_TYPE_MASK covers all of the above socket types. The remaining bits are @@ -448,6 +448,8 @@ type ControlMessageCredentials struct { // A ControlMessageIPPacketInfo is IP_PKTINFO socket control message. // // ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h. +// +// +stateify savable type ControlMessageIPPacketInfo struct { NIC int32 LocalAddr InetAddr diff --git a/pkg/coverage/coverage.go b/pkg/coverage/coverage.go index a4f4b2c5e..fdfe31417 100644 --- a/pkg/coverage/coverage.go +++ b/pkg/coverage/coverage.go @@ -27,6 +27,7 @@ import ( "io" "sort" "sync/atomic" + "testing" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -34,12 +35,6 @@ import ( "github.com/bazelbuild/rules_go/go/tools/coverdata" ) -// KcovAvailable returns whether the kcov coverage interface is available. It is -// available as long as coverage is enabled for some files. -func KcovAvailable() bool { - return len(coverdata.Cover.Blocks) > 0 -} - // coverageMu must be held while accessing coverdata.Cover. This prevents // concurrent reads/writes from multiple threads collecting coverage data. var coverageMu sync.RWMutex @@ -47,6 +42,22 @@ var coverageMu sync.RWMutex // once ensures that globalData is only initialized once. var once sync.Once +// blockBitLength is the number of bits used to represent coverage block index +// in a synthetic PC (the rest are used to represent the file index). Even +// though a PC has 64 bits, we only use the lower 32 bits because some users +// (e.g., syzkaller) may truncate that address to a 32-bit value. +// +// As of this writing, there are ~1200 files that can be instrumented and at +// most ~1200 blocks per file, so 16 bits is more than enough to represent every +// file and every block. +const blockBitLength = 16 + +// KcovAvailable returns whether the kcov coverage interface is available. It is +// available as long as coverage is enabled for some files. +func KcovAvailable() bool { + return len(coverdata.Cover.Blocks) > 0 +} + var globalData struct { // files is the set of covered files sorted by filename. It is calculated at // startup. @@ -104,14 +115,14 @@ var coveragePool = sync.Pool{ // coverage tools, we reset the global coverage data every time this function is // run. func ConsumeCoverageData(w io.Writer) int { - once.Do(initCoverageData) + InitCoverageData() coverageMu.Lock() defer coverageMu.Unlock() total := 0 var pcBuffer [8]byte - for fileIndex, file := range globalData.files { + for fileNum, file := range globalData.files { counters := coverdata.Cover.Counters[file] for index := 0; index < len(counters); index++ { if atomic.LoadUint32(&counters[index]) == 0 { @@ -119,7 +130,7 @@ func ConsumeCoverageData(w io.Writer) int { } // Non-zero coverage data found; consume it and report as a PC. atomic.StoreUint32(&counters[index], 0) - pc := globalData.syntheticPCs[fileIndex][index] + pc := globalData.syntheticPCs[fileNum][index] usermem.ByteOrder.PutUint64(pcBuffer[:], pc) n, err := w.Write(pcBuffer[:]) if err != nil { @@ -142,31 +153,84 @@ func ConsumeCoverageData(w io.Writer) int { return total } -// initCoverageData initializes globalData. It should only be called once, -// before any kcov data is written. -func initCoverageData() { - // First, order all files. Then calculate synthetic PCs for every block - // (using the well-defined ordering for files as well). - for file := range coverdata.Cover.Blocks { - globalData.files = append(globalData.files, file) +// InitCoverageData initializes globalData. It should be called before any kcov +// data is written. +func InitCoverageData() { + once.Do(func() { + // First, order all files. Then calculate synthetic PCs for every block + // (using the well-defined ordering for files as well). + for file := range coverdata.Cover.Blocks { + globalData.files = append(globalData.files, file) + } + sort.Strings(globalData.files) + + for fileNum, file := range globalData.files { + blocks := coverdata.Cover.Blocks[file] + pcs := make([]uint64, 0, len(blocks)) + for blockNum := range blocks { + pcs = append(pcs, calculateSyntheticPC(fileNum, blockNum)) + } + globalData.syntheticPCs = append(globalData.syntheticPCs, pcs) + } + }) +} + +// Symbolize prints information about the block corresponding to pc. +func Symbolize(out io.Writer, pc uint64) error { + fileNum, blockNum := syntheticPCToIndexes(pc) + file, err := fileFromIndex(fileNum) + if err != nil { + return err + } + block, err := blockFromIndex(file, blockNum) + if err != nil { + return err } - sort.Strings(globalData.files) - - // nextSyntheticPC is the first PC that we generate for a block. - // - // This uses a standard-looking kernel range for simplicity. - // - // FIXME(b/160639712): This is only necessary because syzkaller requires - // addresses in the kernel range. If we can remove this constraint, then we - // should be able to use the actual addresses. - var nextSyntheticPC uint64 = 0xffffffff80000000 - for _, file := range globalData.files { - blocks := coverdata.Cover.Blocks[file] - thisFile := make([]uint64, 0, len(blocks)) - for range blocks { - thisFile = append(thisFile, nextSyntheticPC) - nextSyntheticPC++ // Advance. + writeBlock(out, pc, file, block) + return nil +} + +// WriteAllBlocks prints all information about all blocks along with their +// corresponding synthetic PCs. +func WriteAllBlocks(out io.Writer) { + for fileNum, file := range globalData.files { + for blockNum, block := range coverdata.Cover.Blocks[file] { + writeBlock(out, calculateSyntheticPC(fileNum, blockNum), file, block) } - globalData.syntheticPCs = append(globalData.syntheticPCs, thisFile) } } + +func calculateSyntheticPC(fileNum int, blockNum int) uint64 { + return (uint64(fileNum) << blockBitLength) + uint64(blockNum) +} + +func syntheticPCToIndexes(pc uint64) (fileNum int, blockNum int) { + return int(pc >> blockBitLength), int(pc & ((1 << blockBitLength) - 1)) +} + +// fileFromIndex returns the name of the file in the sorted list of instrumented files. +func fileFromIndex(i int) (string, error) { + total := len(globalData.files) + if i < 0 || i >= total { + return "", fmt.Errorf("file index out of range: [%d] with length %d", i, total) + } + return globalData.files[i], nil +} + +// blockFromIndex returns the i-th block in the given file. +func blockFromIndex(file string, i int) (testing.CoverBlock, error) { + blocks, ok := coverdata.Cover.Blocks[file] + if !ok { + return testing.CoverBlock{}, fmt.Errorf("instrumented file %s does not exist", file) + } + total := len(blocks) + if i < 0 || i >= total { + return testing.CoverBlock{}, fmt.Errorf("block index out of range: [%d] with length %d", i, total) + } + return blocks[i], nil +} + +func writeBlock(out io.Writer, pc uint64, file string, block testing.CoverBlock) { + io.WriteString(out, fmt.Sprintf("%#x\n", pc)) + io.WriteString(out, fmt.Sprintf("%s:%d.%d,%d.%d\n", file, block.Line0, block.Col0, block.Line1, block.Col1)) +} diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index f7f9dbf86..69eeb7528 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -36,3 +36,14 @@ package cpuid // On arm64, features are numbered according to the ELF HWCAP definition. // arch/arm64/include/uapi/asm/hwcap.h type Feature int + +// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a +// subset of the host feature set. +type ErrIncompatible struct { + message string +} + +// Error implements error. +func (e ErrIncompatible) Error() string { + return e.message +} diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go index 17a89c00d..392711e8f 100644 --- a/pkg/cpuid/cpuid_x86.go +++ b/pkg/cpuid/cpuid_x86.go @@ -681,17 +681,6 @@ func (fs *FeatureSet) Intel() bool { return fs.VendorID == intelVendorID } -// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a -// subset of the host feature set. -type ErrIncompatible struct { - message string -} - -// Error implements error. -func (e ErrIncompatible) Error() string { - return e.message -} - // CheckHostCompatible returns nil if fs is a subset of the host feature set. func (fs *FeatureSet) CheckHostCompatible() error { hfs := HostFeatureSet() diff --git a/pkg/crypto/BUILD b/pkg/crypto/BUILD new file mode 100644 index 000000000..08fa772ca --- /dev/null +++ b/pkg/crypto/BUILD @@ -0,0 +1,12 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "crypto", + srcs = [ + "crypto.go", + "crypto_stdlib.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/sleep/empty.s b/pkg/crypto/crypto.go index fb37360ac..b26b55d37 100644 --- a/pkg/sleep/empty.s +++ b/pkg/crypto/crypto.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,4 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Empty assembly file so empty func definitions work. +// Package crypto wraps crypto primitives. +package crypto diff --git a/pkg/crypto/crypto_stdlib.go b/pkg/crypto/crypto_stdlib.go new file mode 100644 index 000000000..74a55a123 --- /dev/null +++ b/pkg/crypto/crypto_stdlib.go @@ -0,0 +1,32 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package crypto + +import ( + "crypto/ecdsa" + "crypto/sha512" + "math/big" +) + +// EcdsaVerify verifies the signature in r, s of hash using ECDSA and the +// public key, pub. Its return value records whether the signature is valid. +func EcdsaVerify(pub *ecdsa.PublicKey, hash []byte, r, s *big.Int) bool { + return ecdsa.Verify(pub, hash, r, s) +} + +// SumSha384 returns the SHA384 checksum of the data. +func SumSha384(data []byte) (sum384 [sha512.Size384]byte) { + return sha512.Sum384(data) +} diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD index aa8e4e1f3..cc31d0175 100644 --- a/pkg/flipcall/BUILD +++ b/pkg/flipcall/BUILD @@ -11,7 +11,8 @@ go_library( "futex_linux.go", "io.go", "packet_window_allocator.go", - "packet_window_mmap.go", + "packet_window_mmap_amd64.go", + "packet_window_mmap_arm64.go", ], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/flipcall/packet_window_mmap.go b/pkg/flipcall/packet_window_mmap_amd64.go index 869183b11..869183b11 100644 --- a/pkg/flipcall/packet_window_mmap.go +++ b/pkg/flipcall/packet_window_mmap_amd64.go diff --git a/pkg/syncevent/waiter_asm_unsafe.go b/pkg/flipcall/packet_window_mmap_arm64.go index 19d6b0b15..b9c9c44f6 100644 --- a/pkg/syncevent/waiter_asm_unsafe.go +++ b/pkg/flipcall/packet_window_mmap_arm64.go @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build amd64 arm64 +// +build arm64 -package syncevent +package flipcall import ( - "unsafe" + "syscall" ) -// See waiter_noasm_unsafe.go for a description of waiterUnlock. -func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool +// Return a memory mapping of the pwd in memory that can be shared outside the sandbox. +func packetWindowMmap(pwd PacketWindowDescriptor) (uintptr, syscall.Errno) { + m, _, err := syscall.RawSyscall6(syscall.SYS_MMAP, 0, uintptr(pwd.Length), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset)) + return m, err +} diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD index d855b702c..08832a8ae 100644 --- a/pkg/goid/BUILD +++ b/pkg/goid/BUILD @@ -9,6 +9,7 @@ go_library( "goid_amd64.s", "goid_arm64.s", ], + stateify = False, visibility = ["//visibility:public"], ) diff --git a/pkg/log/json.go b/pkg/log/json.go index bdf9d691e..8c52dcc87 100644 --- a/pkg/log/json.go +++ b/pkg/log/json.go @@ -27,8 +27,8 @@ type jsonLog struct { } // MarshalJSON implements json.Marshaler.MarashalJSON. -func (lv Level) MarshalJSON() ([]byte, error) { - switch lv { +func (l Level) MarshalJSON() ([]byte, error) { + switch l { case Warning: return []byte(`"warning"`), nil case Info: @@ -36,20 +36,20 @@ func (lv Level) MarshalJSON() ([]byte, error) { case Debug: return []byte(`"debug"`), nil default: - return nil, fmt.Errorf("unknown level %v", lv) + return nil, fmt.Errorf("unknown level %v", l) } } // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON. It can unmarshal // from both string names and integers. -func (lv *Level) UnmarshalJSON(b []byte) error { +func (l *Level) UnmarshalJSON(b []byte) error { switch s := string(b); s { case "0", `"warning"`: - *lv = Warning + *l = Warning case "1", `"info"`: - *lv = Info + *l = Info case "2", `"debug"`: - *lv = Debug + *l = Debug default: return fmt.Errorf("unknown level %q", s) } diff --git a/pkg/log/log.go b/pkg/log/log.go index 37e0605ad..2e3408357 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -356,7 +356,7 @@ func CopyStandardLogTo(l Level) error { case Warning: f = Warningf default: - return fmt.Errorf("Unknown log level %v", l) + return fmt.Errorf("unknown log level %v", l) } stdlog.SetOutput(linewriter.NewWriter(func(p []byte) { diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go index 6acee90ef..aea7dde38 100644 --- a/pkg/merkletree/merkletree.go +++ b/pkg/merkletree/merkletree.go @@ -350,9 +350,13 @@ type VerifyParams struct { // For verifyMetadata, params.data is not needed. It only accesses params.tree // for the raw root hash. func verifyMetadata(params *VerifyParams, layout *Layout) error { - root := make([]byte, layout.digestSize) - if _, err := params.Tree.ReadAt(root, layout.blockOffset(layout.rootLevel(), 0 /* index */)); err != nil { - return fmt.Errorf("failed to read root hash: %w", err) + var root []byte + // Only read the root hash if we expect that the Merkle tree file is non-empty. + if params.Size != 0 { + root = make([]byte, layout.digestSize) + if _, err := params.Tree.ReadAt(root, layout.blockOffset(layout.rootLevel(), 0 /* index */)); err != nil { + return fmt.Errorf("failed to read root hash: %w", err) + } } descriptor := VerityDescriptor{ Name: params.Name, diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 71e944c30..eadea390a 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -570,6 +570,8 @@ func (c *Client) Version() uint32 { func (c *Client) Close() { // unet.Socket.Shutdown() has no effect if unet.Socket.Close() has already // been called (by c.watch()). - c.socket.Shutdown() + if err := c.socket.Shutdown(); err != nil { + log.Warningf("Socket.Shutdown() failed (FD: %d): %v", c.socket.FD(), err) + } c.closedWg.Wait() } diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 28fe081d6..8b46a2987 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -478,28 +478,23 @@ func (r *ReadWriterFile) ReadAt(p []byte, offset int64) (int, error) { } // Write implements part of the io.ReadWriter interface. +// +// Note that this may return a short write with a nil error. This violates the +// contract of io.Writer, but is more consistent with gVisor's pattern of +// returning errors that correspond to Linux errnos. Since short writes without +// error are common in Linux, returning a nil error is appropriate. func (r *ReadWriterFile) Write(p []byte) (int, error) { n, err := r.File.WriteAt(p, r.Offset) r.Offset += uint64(n) - if err != nil { - return n, err - } - if n < len(p) { - return n, io.ErrShortWrite - } - return n, nil + return n, err } // WriteAt implements the io.WriteAt interface. +// +// Note that this may return a short write with a nil error. This violates the +// contract of io.WriterAt. See comment on Write for justification. func (r *ReadWriterFile) WriteAt(p []byte, offset int64) (int, error) { - n, err := r.File.WriteAt(p, uint64(offset)) - if err != nil { - return n, err - } - if n < len(p) { - return n, io.ErrShortWrite - } - return n, nil + return r.File.WriteAt(p, uint64(offset)) } // Rename implements File.Rename. diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index abd237f46..81ceb37c5 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -296,25 +296,6 @@ func (t *Tlopen) handle(cs *connState) message { } defer ref.DecRef() - ref.openedMu.Lock() - defer ref.openedMu.Unlock() - - // Has it been opened already? - if ref.opened || !CanOpen(ref.mode) { - return newErr(syscall.EINVAL) - } - - if ref.mode.IsDir() { - // Directory must be opened ReadOnly. - if t.Flags&OpenFlagsModeMask != ReadOnly { - return newErr(syscall.EISDIR) - } - // Directory not truncatable. - if t.Flags&OpenTruncate != 0 { - return newErr(syscall.EISDIR) - } - } - var ( qid QID ioUnit uint32 @@ -326,6 +307,22 @@ func (t *Tlopen) handle(cs *connState) message { return syscall.EINVAL } + // Has it been opened already? + if ref.opened || !CanOpen(ref.mode) { + return syscall.EINVAL + } + + if ref.mode.IsDir() { + // Directory must be opened ReadOnly. + if t.Flags&OpenFlagsModeMask != ReadOnly { + return syscall.EISDIR + } + // Directory not truncatable. + if t.Flags&OpenTruncate != 0 { + return syscall.EISDIR + } + } + osFile, qid, ioUnit, err = ref.file.Open(t.Flags) return err }); err != nil { @@ -366,7 +363,7 @@ func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { } // Not allowed on open directories. - if _, opened := ref.OpenFlags(); opened { + if ref.opened { return syscall.EINVAL } @@ -437,7 +434,7 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { } // Not allowed on open directories. - if _, opened := ref.OpenFlags(); opened { + if ref.opened { return syscall.EINVAL } @@ -476,7 +473,7 @@ func (t *Tlink) handle(cs *connState) message { } // Not allowed on open directories. - if _, opened := ref.OpenFlags(); opened { + if ref.opened { return syscall.EINVAL } @@ -518,7 +515,7 @@ func (t *Trenameat) handle(cs *connState) message { } // Not allowed on open directories. - if _, opened := ref.OpenFlags(); opened { + if ref.opened { return syscall.EINVAL } @@ -561,7 +558,7 @@ func (t *Tunlinkat) handle(cs *connState) message { } // Not allowed on open directories. - if _, opened := ref.OpenFlags(); opened { + if ref.opened { return syscall.EINVAL } @@ -701,13 +698,12 @@ func (t *Tread) handle(cs *connState) message { ) if err := ref.safelyRead(func() (err error) { // Has it been opened already? - openFlags, opened := ref.OpenFlags() - if !opened { + if !ref.opened { return syscall.EINVAL } // Can it be read? Check permissions. - if openFlags&OpenFlagsModeMask == WriteOnly { + if ref.openFlags&OpenFlagsModeMask == WriteOnly { return syscall.EPERM } @@ -731,13 +727,12 @@ func (t *Twrite) handle(cs *connState) message { var n int if err := ref.safelyRead(func() (err error) { // Has it been opened already? - openFlags, opened := ref.OpenFlags() - if !opened { + if !ref.opened { return syscall.EINVAL } // Can it be written? Check permissions. - if openFlags&OpenFlagsModeMask == ReadOnly { + if ref.openFlags&OpenFlagsModeMask == ReadOnly { return syscall.EPERM } @@ -778,7 +773,7 @@ func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { } // Not allowed on open directories. - if _, opened := ref.OpenFlags(); opened { + if ref.opened { return syscall.EINVAL } @@ -820,7 +815,7 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { } // Not allowed on open directories. - if _, opened := ref.OpenFlags(); opened { + if ref.opened { return syscall.EINVAL } @@ -898,13 +893,12 @@ func (t *Tallocate) handle(cs *connState) message { if err := ref.safelyWrite(func() error { // Has it been opened already? - openFlags, opened := ref.OpenFlags() - if !opened { + if !ref.opened { return syscall.EINVAL } // Can it be written? Check permissions. - if openFlags&OpenFlagsModeMask == ReadOnly { + if ref.openFlags&OpenFlagsModeMask == ReadOnly { return syscall.EBADF } @@ -1049,8 +1043,8 @@ func (t *Treaddir) handle(cs *connState) message { return syscall.EINVAL } - // Has it been opened already? - if _, opened := ref.OpenFlags(); !opened { + // Has it been opened yet? + if !ref.opened { return syscall.EINVAL } @@ -1076,8 +1070,8 @@ func (t *Tfsync) handle(cs *connState) message { defer ref.DecRef() if err := ref.safelyRead(func() (err error) { - // Has it been opened already? - if _, opened := ref.OpenFlags(); !opened { + // Has it been opened yet? + if !ref.opened { return syscall.EINVAL } @@ -1185,8 +1179,13 @@ func doWalk(cs *connState, ref *fidRef, names []string, getattr bool) (qids []QI } // Has it been opened already? - if _, opened := ref.OpenFlags(); opened { - err = syscall.EBUSY + err = ref.safelyRead(func() (err error) { + if ref.opened { + return syscall.EBUSY + } + return nil + }) + if err != nil { return } diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 6e605b14c..2e3d427ae 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -678,16 +678,15 @@ func renameHelper(h *Harness, root p9.File, srcNames []string, dstNames []string // case. defer checkDeleted(h, dst) } else { + // If the type is different than the destination, then + // we expect the rename to fail. We expect that this + // is returned. + // + // If the file being renamed to itself, this is + // technically allowed and a no-op, but all the + // triggers will fire. if !selfRename { - // If the type is different than the - // destination, then we expect the rename to - // fail. We expect ensure that this is - // returned. expectedErr = syscall.EINVAL - } else { - // This is the file being renamed to itself. - // This is technically allowed and a no-op, but - // all the triggers will fire. } dst.Close() } diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 3736f12a3..8c5c434fd 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -134,12 +134,11 @@ type fidRef struct { // The node above will be closed only when refs reaches zero. refs int64 - // openedMu protects opened and openFlags. - openedMu sync.Mutex - // opened indicates whether this has been opened already. // // This is updated in handlers.go. + // + // opened is protected by pathNode.opMu or renameMu (for write). opened bool // mode is the fidRef's mode from the walk. Only the type bits are @@ -151,6 +150,8 @@ type fidRef struct { // openFlags is the mode used in the open. // // This is updated in handlers.go. + // + // openFlags is protected by pathNode.opMu or renameMu (for write). openFlags OpenFlags // pathNode is the current pathNode for this FID. @@ -177,13 +178,6 @@ type fidRef struct { deleted uint32 } -// OpenFlags returns the flags the file was opened with and true iff the fid was opened previously. -func (f *fidRef) OpenFlags() (OpenFlags, bool) { - f.openedMu.Lock() - defer f.openedMu.Unlock() - return f.openFlags, f.opened -} - // IncRef increases the references on a fid. func (f *fidRef) IncRef() { atomic.AddInt64(&f.refs, 1) diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go index e7406b374..a29f06ddb 100644 --- a/pkg/p9/transport_test.go +++ b/pkg/p9/transport_test.go @@ -197,33 +197,33 @@ func BenchmarkSendRecv(b *testing.B) { for i := 0; i < b.N; i++ { tag, m, err := recv(server, maximumLength, msgRegistry.get) if err != nil { - b.Fatalf("recv got err %v expected nil", err) + b.Errorf("recv got err %v expected nil", err) } if tag != Tag(1) { - b.Fatalf("got tag %v expected 1", tag) + b.Errorf("got tag %v expected 1", tag) } if _, ok := m.(*Rflush); !ok { - b.Fatalf("got message %T expected *Rflush", m) + b.Errorf("got message %T expected *Rflush", m) } if err := send(server, Tag(2), &Rflush{}); err != nil { - b.Fatalf("send got err %v expected nil", err) + b.Errorf("send got err %v expected nil", err) } } }() b.ResetTimer() for i := 0; i < b.N; i++ { if err := send(client, Tag(1), &Rflush{}); err != nil { - b.Fatalf("send got err %v expected nil", err) + b.Errorf("send got err %v expected nil", err) } tag, m, err := recv(client, maximumLength, msgRegistry.get) if err != nil { - b.Fatalf("recv got err %v expected nil", err) + b.Errorf("recv got err %v expected nil", err) } if tag != Tag(2) { - b.Fatalf("got tag %v expected 2", tag) + b.Errorf("got tag %v expected 2", tag) } if _, ok := m.(*Rflush); !ok { - b.Fatalf("got message %v expected *Rflush", m) + b.Errorf("got message %v expected *Rflush", m) } } } diff --git a/pkg/pool/pool.go b/pkg/pool/pool.go index a1b2e0cfe..54e825b28 100644 --- a/pkg/pool/pool.go +++ b/pkg/pool/pool.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package pool provides a trivial integer pool. package pool import ( diff --git a/pkg/refsvfs2/BUILD b/pkg/refsvfs2/BUILD index bfa1daa10..0377c0876 100644 --- a/pkg/refsvfs2/BUILD +++ b/pkg/refsvfs2/BUILD @@ -9,7 +9,7 @@ go_template( "refs_template.go", ], opt_consts = [ - "logTrace", + "enableLogging", ], types = [ "T", diff --git a/pkg/refsvfs2/refs_template.go b/pkg/refsvfs2/refs_template.go index f64b6c6ae..3fbc91aa5 100644 --- a/pkg/refsvfs2/refs_template.go +++ b/pkg/refsvfs2/refs_template.go @@ -74,11 +74,6 @@ func (r *Refs) LogRefs() bool { return enableLogging } -// EnableLeakCheck enables reference leak checking on r. -func (r *Refs) EnableLeakCheck() { - refsvfs2.Register(r) -} - // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *Refs) ReadRefs() int64 { @@ -136,7 +131,7 @@ func (r *Refs) TryIncRef() bool { func (r *Refs) DecRef(destroy func()) { v := atomic.AddInt64(&r.refCount, -1) if enableLogging { - refsvfs2.LogDecRef(r, v+1) + refsvfs2.LogDecRef(r, v) } switch { case v < 0: @@ -153,6 +148,6 @@ func (r *Refs) DecRef(destroy func()) { func (r *Refs) afterLoad() { if r.ReadRefs() > 0 { - r.EnableLeakCheck() + refsvfs2.Register(r) } } diff --git a/pkg/safemem/block_unsafe.go b/pkg/safemem/block_unsafe.go index e7fd30743..7857f5853 100644 --- a/pkg/safemem/block_unsafe.go +++ b/pkg/safemem/block_unsafe.go @@ -68,29 +68,29 @@ func blockFromSlice(slice []byte, needSafecopy bool) Block { } } -// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is +// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+length), which is // safe to access without safecopy. // -// Preconditions: ptr+len does not overflow. -func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block { - return blockFromPointer(ptr, len, false) +// Preconditions: ptr+length does not overflow. +func BlockFromSafePointer(ptr unsafe.Pointer, length int) Block { + return blockFromPointer(ptr, length, false) } // BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which // is not safe to access without safecopy. // // Preconditions: ptr+len does not overflow. -func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block { - return blockFromPointer(ptr, len, true) +func BlockFromUnsafePointer(ptr unsafe.Pointer, length int) Block { + return blockFromPointer(ptr, length, true) } -func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block { - if uptr := uintptr(ptr); uptr+uintptr(len) < uptr { - panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len)) +func blockFromPointer(ptr unsafe.Pointer, length int, needSafecopy bool) Block { + if uptr := uintptr(ptr); uptr+uintptr(length) < uptr { + panic(fmt.Sprintf("ptr %#x + len %#x overflows", uptr, length)) } return Block{ start: ptr, - length: len, + length: length, needSafecopy: needSafecopy, } } diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 752e2dc32..ec17ebc4d 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -79,7 +79,7 @@ func Install(rules SyscallRules) error { // Perform the actual installation. if errno := SetFilter(instrs); errno != 0 { - return fmt.Errorf("Failed to set filter: %v", errno) + return fmt.Errorf("failed to set filter: %v", errno) } log.Infof("Seccomp filters installed.") diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index 7cd895cc7..652c010da 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package segment is a test package. package segment type setFunctions struct{} diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index d75d665ae..dd2effdf9 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -365,3 +365,18 @@ func (a SyscallArgument) SizeT() uint { func (a SyscallArgument) ModeT() uint { return uint(uint16(a.Value)) } + +// ErrFloatingPoint indicates a failed restore due to unusable floating point +// state. +type ErrFloatingPoint struct { + // supported is the supported floating point state. + supported uint64 + + // saved is the saved floating point state. + saved uint64 +} + +// Error returns a sensible description of the restore error. +func (e ErrFloatingPoint) Error() string { + return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supported, e.saved) +} diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 19ce99d25..840e53d33 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -17,27 +17,10 @@ package arch import ( - "fmt" - "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/usermem" ) -// ErrFloatingPoint indicates a failed restore due to unusable floating point -// state. -type ErrFloatingPoint struct { - // supported is the supported floating point state. - supported uint64 - - // saved is the saved floating point state. - saved uint64 -} - -// Error returns a sensible description of the restore error. -func (e ErrFloatingPoint) Error() string { - return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supported, e.saved) -} - // XSTATE_BV does not exist if FXSAVE is used, but FXSAVE implicitly saves x87 // and SSE state, so this is the equivalent XSTATE_BV value. const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go index c9fb55d00..35d2e07c3 100644 --- a/pkg/sentry/arch/signal.go +++ b/pkg/sentry/arch/signal.go @@ -152,23 +152,23 @@ func (s *SignalInfo) FixSignalCodeForUser() { } } -// Pid returns the si_pid field. -func (s *SignalInfo) Pid() int32 { +// PID returns the si_pid field. +func (s *SignalInfo) PID() int32 { return int32(usermem.ByteOrder.Uint32(s.Fields[0:4])) } -// SetPid mutates the si_pid field. -func (s *SignalInfo) SetPid(val int32) { +// SetPID mutates the si_pid field. +func (s *SignalInfo) SetPID(val int32) { usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) } -// Uid returns the si_uid field. -func (s *SignalInfo) Uid() int32 { +// UID returns the si_uid field. +func (s *SignalInfo) UID() int32 { return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) } -// SetUid mutates the si_uid field. -func (s *SignalInfo) SetUid(val int32) { +// SetUID mutates the si_uid field. +func (s *SignalInfo) SetUID(val int32) { usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) } @@ -251,3 +251,26 @@ func (s *SignalInfo) Arch() uint32 { func (s *SignalInfo) SetArch(val uint32) { usermem.ByteOrder.PutUint32(s.Fields[12:16], val) } + +// Band returns the si_band field. +func (s *SignalInfo) Band() int64 { + return int64(usermem.ByteOrder.Uint64(s.Fields[0:8])) +} + +// SetBand mutates the si_band field. +func (s *SignalInfo) SetBand(val int64) { + // Note: this assumes the platform uses `long` as `__ARCH_SI_BAND_T`. + // On some platforms, which gVisor doesn't support, `__ARCH_SI_BAND_T` is + // `int`. See siginfo.h. + usermem.ByteOrder.PutUint64(s.Fields[0:8], uint64(val)) +} + +// FD returns the si_fd field. +func (s *SignalInfo) FD() uint32 { + return usermem.ByteOrder.Uint32(s.Fields[8:12]) +} + +// SetFD mutates the si_fd field. +func (s *SignalInfo) SetFD(val uint32) { + usermem.ByteOrder.PutUint32(s.Fields[8:12], val) +} diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 2bf3c45e1..91b8fb44f 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -193,7 +193,7 @@ func (p *Profile) StopTrace(_, _ *struct{}) error { defer p.mu.Unlock() if p.traceFile == nil { - return errors.New("Execution tracing not started") + return errors.New("execution tracing not started") } // Similarly to the case above, if tasks have not ended traces, we will diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index d800f2c85..62eaca965 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -62,6 +62,7 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error { Callback: func(err error) { if err == nil { log.Infof("Save succeeded: exiting...") + s.Kernel.SetSaveSuccess(false /* autosave */) } else { log.Warningf("Save failed: exiting...") s.Kernel.SetSaveError(err) diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index 314661475..badd5b073 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package fdimport provides the Import function. package fdimport import ( diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index ea85ab33c..5c3e852e9 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -49,13 +49,13 @@ go_library( "//pkg/amutex", "//pkg/context", "//pkg/log", - "//pkg/metric", "//pkg/p9", "//pkg/refs", "//pkg/secio", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs/lock", + "//pkg/sentry/fsmetric", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ff2fe6712..8e0aa9019 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -336,7 +336,12 @@ func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr err // copyUpBuffers is a buffer pool for copying file content. The buffer // size is the same used by io.Copy. -var copyUpBuffers = sync.Pool{New: func() interface{} { return make([]byte, 8*usermem.PageSize) }} +var copyUpBuffers = sync.Pool{ + New: func() interface{} { + b := make([]byte, 8*usermem.PageSize) + return &b + }, +} // copyContentsLocked copies the contents of lower to upper. It panics if // less than size bytes can be copied. @@ -361,7 +366,7 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in defer lowerFile.DecRef(ctx) // Use a buffer pool to minimize allocations. - buf := copyUpBuffers.Get().([]byte) + buf := copyUpBuffers.Get().(*[]byte) defer copyUpBuffers.Put(buf) // Transfer the contents. @@ -371,7 +376,7 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in // optimizations could be self-defeating. So we leave this as simple as possible. var offset int64 for { - nr, err := lowerFile.FileOperations.Read(ctx, lowerFile, usermem.BytesIOSequence(buf), offset) + nr, err := lowerFile.FileOperations.Read(ctx, lowerFile, usermem.BytesIOSequence(*buf), offset) if err != nil && err != io.EOF { return err } @@ -383,7 +388,7 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in } return nil } - nw, err := upperFile.FileOperations.Write(ctx, upperFile, usermem.BytesIOSequence(buf[:nr]), offset) + nw, err := upperFile.FileOperations.Write(ctx, upperFile, usermem.BytesIOSequence((*buf)[:nr]), offset) if err != nil { return err } diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index c7a11eec1..e04784db2 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -64,7 +64,7 @@ func TestConcurrentCopyUp(t *testing.T) { wg.Add(1) go func(o *overlayTestFile) { if err := o.File.Dirent.Inode.Truncate(ctx, o.File.Dirent, truncateFileSize); err != nil { - t.Fatalf("failed to copy up: %v", err) + t.Errorf("failed to copy up: %v", err) } wg.Done() }(file) diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 72ea70fcf..57f904801 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -17,13 +17,12 @@ package fs import ( "math" "sync/atomic" - "time" "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" @@ -33,28 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -var ( - // RecordWaitTime controls writing metrics for filesystem reads. - // Enabling this comes at a small CPU cost due to performing two - // monotonic clock reads per read call. - // - // Note that this is only performed in the direct read path, and may - // not be consistently applied for other forms of reads, such as - // splice. - RecordWaitTime = false - - reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") - readWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.") -) - -// IncrementWait increments the given wait time metric, if enabled. -func IncrementWait(m *metric.Uint64Metric, start time.Time) { - if !RecordWaitTime { - return - } - m.IncrementBy(uint64(time.Since(start))) -} - // FileMaxOffset is the maximum possible file offset. const FileMaxOffset = math.MaxInt64 @@ -257,22 +234,19 @@ func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { // // Returns syserror.ErrInterrupted if reading was interrupted. func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) { - var start time.Time - if RecordWaitTime { - start = time.Now() - } + start := fsmetric.StartReadWait() + defer fsmetric.FinishReadWait(fsmetric.ReadWait, start) + if !f.mu.Lock(ctx) { - IncrementWait(readWait, start) return 0, syserror.ErrInterrupted } - reads.Increment() + fsmetric.Reads.Increment() n, err := f.FileOperations.Read(ctx, f, dst, f.offset) if n > 0 && !f.flags.NonSeekable { atomic.AddInt64(&f.offset, n) } f.mu.Unlock() - IncrementWait(readWait, start) return n, err } @@ -282,19 +256,16 @@ func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) // // Otherwise same as Readv. func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - var start time.Time - if RecordWaitTime { - start = time.Now() - } + start := fsmetric.StartReadWait() + defer fsmetric.FinishReadWait(fsmetric.ReadWait, start) + if !f.mu.Lock(ctx) { - IncrementWait(readWait, start) return 0, syserror.ErrInterrupted } - reads.Increment() + fsmetric.Reads.Increment() n, err := f.FileOperations.Read(ctx, f, dst, offset) f.mu.Unlock() - IncrementWait(readWait, start) return n, err } diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 8049538f2..ec3d3f96c 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -52,10 +52,10 @@ func NewTestFile(tb testing.TB) *fs.File { // Read just fails the request. func (*TestFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, fmt.Errorf("Readv not implemented") + return 0, fmt.Errorf("TestFileOperations.Read not implemented") } // Write just fails the request. func (*TestFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, fmt.Errorf("Writev not implemented") + return 0, fmt.Errorf("TestFileOperations.Write not implemented") } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index d2dbff268..a020da53b 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -65,7 +65,7 @@ var ( // runs with the lock held for reading. AsyncBarrier will take the lock // for writing, thus ensuring that all Async work completes before // AsyncBarrier returns. - workMu sync.RWMutex + workMu sync.CrossGoroutineRWMutex // asyncError is used to store up to one asynchronous execution error. asyncError = make(chan error, 1) diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index fea135eea..4c30098cd 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/context", "//pkg/fd", "//pkg/log", - "//pkg/metric", "//pkg/p9", "//pkg/refs", "//pkg/safemem", @@ -38,6 +37,7 @@ go_library( "//pkg/sentry/fs/fdpipe", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/host", + "//pkg/sentry/fsmetric", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index d481baf77..e5579095b 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -117,8 +117,6 @@ func ntype(pattr p9.Attr) fs.InodeType { return fs.BlockDevice case pattr.Mode.IsSocket(): return fs.Socket - case pattr.Mode.IsRegular(): - fallthrough default: return fs.RegularFile } diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index c0bc63a32..bb63448cb 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -21,27 +21,17 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) -var ( - opensWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") - opens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.") - opensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.") - reads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") - readWait9P = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.") - readsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.") - readWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.") -) - // fileOperations implements fs.FileOperations for a remote file system. // // +stateify savable @@ -101,14 +91,14 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF } if flags.Write { if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil { - opensWX.Increment() + fsmetric.GoferOpensWX.Increment() log.Warningf("Opened a writable executable: %q", name) } } if handles.Host != nil { - opensHost.Increment() + fsmetric.GoferOpensHost.Increment() } else { - opens9P.Increment() + fsmetric.GoferOpens9P.Increment() } return fs.NewFile(ctx, dirent, flags, f) } @@ -278,20 +268,17 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I // use this function rather than using a defer in Read() to avoid the performance hit of defer. func (f *fileOperations) incrementReadCounters(start time.Time) { if f.handles.Host != nil { - readsHost.Increment() - fs.IncrementWait(readWaitHost, start) + fsmetric.GoferReadsHost.Increment() + fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) } else { - reads9P.Increment() - fs.IncrementWait(readWait9P, start) + fsmetric.GoferReads9P.Increment() + fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) } } // Read implements fs.FileOperations.Read. func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - var start time.Time - if fs.RecordWaitTime { - start = time.Now() - } + start := fsmetric.StartReadWait() if fs.IsDir(file.Dirent.Inode.StableAttr) { // Not all remote file systems enforce this so this client does. f.incrementReadCounters(start) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 3a225fd39..9d6fdd08f 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -117,7 +117,7 @@ type inodeFileState struct { // loading is acquired when the inodeFileState begins an asynchronous // load. It releases when the load is complete. Callers that require all // state to be available should call waitForLoad() to ensure that. - loading sync.Mutex `state:".(struct{})"` + loading sync.CrossGoroutineMutex `state:".(struct{})"` // savedUAttr is only allocated during S/R. It points to the save-time // unstable attributes and is used to validate restore-time ones. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 004910453..9b3d8166a 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -18,9 +18,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" @@ -28,8 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -var opens = metric.MustCreateNewUint64Metric("/fs/opens", false /* sync */, "Number of file opens.") - // Inode is a file system object that can be simultaneously referenced by different // components of the VFS (Dirent, fs.File, etc). // @@ -247,7 +245,7 @@ func (i *Inode) GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, if i.overlay != nil { return overlayGetFile(ctx, i.overlay, d, flags) } - opens.Increment() + fsmetric.Opens.Increment() return i.InodeOperations.GetFile(ctx, d, flags) } diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index f8aad2dbd..b998fb75d 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -84,6 +84,7 @@ func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode children := map[string]*fs.Inode{ "hostname": newProcInode(ctx, &h, msrc, fs.SpecialFile, nil), + "sem": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), "shmall": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10))), "shmmax": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10))), "shmmni": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10))), diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index aa7199014..b521a86a2 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -15,12 +15,12 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", - "//pkg/metric", "//pkg/safemem", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fsmetric", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index d6c65301c..e04cd608d 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -18,14 +18,13 @@ import ( "fmt" "io" "math" - "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -35,13 +34,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -var ( - opensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.") - opensW = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.") - reads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.") - readWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.") -) - // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file. // These files are backed by pages allocated from a platform.Memory, and may be // directly mapped. @@ -157,9 +149,9 @@ func (*fileInodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldPare // GetFile implements fs.InodeOperations.GetFile. func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { if flags.Write { - opensW.Increment() + fsmetric.TmpfsOpensW.Increment() } else if flags.Read { - opensRO.Increment() + fsmetric.TmpfsOpensRO.Increment() } flags.Pread = true flags.Pwrite = true @@ -319,14 +311,12 @@ func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) { } func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - var start time.Time - if fs.RecordWaitTime { - start = time.Now() - } - reads.Increment() + start := fsmetric.StartReadWait() + defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start) + fsmetric.TmpfsReads.Increment() + // Zero length reads for tmpfs are no-ops. if dst.NumBytes() == 0 { - fs.IncrementWait(readWait, start) return 0, nil } @@ -343,7 +333,6 @@ func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst userm size := f.attr.Size f.dataMu.RUnlock() if offset >= size { - fs.IncrementWait(readWait, start) return 0, io.EOF } @@ -354,7 +343,6 @@ func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst userm f.attr.AccessTime = ktime.NowFromContext(ctx) f.attrMu.Unlock() } - fs.IncrementWait(readWait, start) return n, err } diff --git a/pkg/sentry/fsimpl/fuse/connection_control.go b/pkg/sentry/fsimpl/fuse/connection_control.go index 1b3459c1d..4ab894965 100644 --- a/pkg/sentry/fsimpl/fuse/connection_control.go +++ b/pkg/sentry/fsimpl/fuse/connection_control.go @@ -84,11 +84,7 @@ func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { Flags: fuseDefaultInitFlags, } - req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in) - if err != nil { - return err - } - + req := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in) // Since there is no task to block on and FUSE_INIT is the request // to unblock other requests, use nil. return conn.CallAsync(nil, req) diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go index 91d16c1cf..d8b0d7657 100644 --- a/pkg/sentry/fsimpl/fuse/connection_test.go +++ b/pkg/sentry/fsimpl/fuse/connection_test.go @@ -76,10 +76,7 @@ func TestConnectionAbort(t *testing.T) { var futNormal []*futureResponse for i := 0; i < int(numRequests); i++ { - req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj) - if err != nil { - t.Fatalf("NewRequest creation failed: %v", err) - } + req := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj) fut, err := conn.callFutureLocked(task, req) if err != nil { t.Fatalf("callFutureLocked failed: %v", err) @@ -105,10 +102,7 @@ func TestConnectionAbort(t *testing.T) { } // After abort, Call() should return directly with ENOTCONN. - req, err := conn.NewRequest(creds, 0, 0, 0, testObj) - if err != nil { - t.Fatalf("NewRequest creation failed: %v", err) - } + req := conn.NewRequest(creds, 0, 0, 0, testObj) _, err = conn.Call(task, req) if err != syserror.ENOTCONN { t.Fatalf("Incorrect error code received for Call() after connection aborted") diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 89c3ef079..1bbe6fdb7 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -363,7 +363,7 @@ func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask - if fd.fs.umounted { + if fd.fs == nil || fd.fs.umounted { ready |= waiter.EventErr return ready & mask } diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 95c475a65..bb2d0d31a 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -219,10 +219,7 @@ func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *con data: rand.Uint32(), } - req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj) - if err != nil { - t.Fatalf("NewRequest creation failed: %v", err) - } + req := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj) // Queue up a request. // Analogous to Call except it doesn't block on the task. diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go index 8f220a04b..fcc5d9a2a 100644 --- a/pkg/sentry/fsimpl/fuse/directory.go +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -68,11 +68,7 @@ func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirent } // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS. - req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in) - if err != nil { - return err - } - + req := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in) res, err := fusefs.conn.Call(task, req) if err != nil { return err diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go index 83f2816b7..e138b11f8 100644 --- a/pkg/sentry/fsimpl/fuse/file.go +++ b/pkg/sentry/fsimpl/fuse/file.go @@ -83,12 +83,8 @@ func (fd *fileDescription) Release(ctx context.Context) { opcode = linux.FUSE_RELEASE } kernelTask := kernel.TaskFromContext(ctx) - // ignoring errors and FUSE server reply is analogous to Linux's behavior. - req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in) - if err != nil { - // No way to invoke Call() with an errored request. - return - } + // Ignoring errors and FUSE server reply is analogous to Linux's behavior. + req := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in) // The reply will be ignored since no callback is defined in asyncCallBack(). conn.CallAsync(kernelTask, req) } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 23e827f90..3af807a21 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -119,7 +119,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) if err != nil { - return nil, nil, err + log.Debugf("%s.GetFilesystem: device FD '%v' not parsable: %v", fsType.Name(), deviceDescriptorStr, err) + return nil, nil, syserror.EINVAL } kernelTask := kernel.TaskFromContext(ctx) @@ -360,12 +361,8 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr in.Flags &= ^uint32(linux.O_TRUNC) } - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in) - if err != nil { - return nil, err - } - // Send the request and receive the reply. + req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return nil, err @@ -485,10 +482,7 @@ func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) err return syserror.EINVAL } in := linux.FUSEUnlinkIn{Name: name} - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) - if err != nil { - return err - } + req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return err @@ -515,11 +509,7 @@ func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) erro task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) in := linux.FUSERmDirIn{Name: name} - req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in) - if err != nil { - return err - } - + req := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in) res, err := i.fs.conn.Call(task, req) if err != nil { return err @@ -535,10 +525,7 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) return nil, syserror.EINVAL } - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) - if err != nil { - return nil, err - } + req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return nil, err @@ -574,10 +561,7 @@ func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") return "", syserror.EINVAL } - req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) - if err != nil { - return "", err - } + req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return "", err @@ -680,11 +664,7 @@ func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOp GetAttrFlags: flags, Fh: fh, } - req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) - if err != nil { - return linux.FUSEAttr{}, err - } - + req := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) res, err := i.fs.conn.Call(task, req) if err != nil { return linux.FUSEAttr{}, err @@ -803,11 +783,7 @@ func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre UID: opts.Stat.UID, GID: opts.Stat.GID, } - req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) - if err != nil { - return err - } - + req := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) res, err := conn.Call(task, req) if err != nil { return err diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go index 2d396e84c..23ce91849 100644 --- a/pkg/sentry/fsimpl/fuse/read_write.go +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -79,13 +79,9 @@ func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off ui in.Offset = off + (uint64(pagesRead) << usermem.PageShift) in.Size = pagesCanRead << usermem.PageShift - req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in) - if err != nil { - return nil, 0, err - } - // TODO(gvisor.dev/issue/3247): support async read. + req := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in) res, err := fs.conn.Call(t, req) if err != nil { return nil, 0, err @@ -204,11 +200,7 @@ func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, in.Offset = off + uint64(written) in.Size = toWrite - req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), inode.nodeID, linux.FUSE_WRITE, &in) - if err != nil { - return 0, err - } - + req := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), inode.nodeID, linux.FUSE_WRITE, &in) req.payload = data[written : written+toWrite] // TODO(gvisor.dev/issue/3247): support async write. diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go index 7fa00569b..41d679358 100644 --- a/pkg/sentry/fsimpl/fuse/request_response.go +++ b/pkg/sentry/fsimpl/fuse/request_response.go @@ -70,6 +70,7 @@ func (r *fuseInitRes) UnmarshalBytes(src []byte) { out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2])) src = src[2:] } + _ = src // Remove unused warning. } // SizeBytes is the size of the payload of the FUSE_INIT response. @@ -104,7 +105,7 @@ type Request struct { } // NewRequest creates a new request that can be sent to the FUSE server. -func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) *Request { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) @@ -130,7 +131,7 @@ func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint id: hdr.Unique, hdr: &hdr, data: buf, - }, nil + } } // futureResponse represents an in-flight request, that may or may not have diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 4c3e9acf8..807b6ed1f 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -59,6 +59,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsmetric", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 2294c490e..df27554d3 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" @@ -985,14 +986,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open switch d.fileType() { case linux.S_IFREG: if !d.fs.opts.regularFilesUseSpecialFileFD { - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil { + if err := d.ensureSharedHandle(ctx, ats.MayRead(), ats.MayWrite(), trunc); err != nil { return nil, err } - fd := ®ularFileFD{} - fd.LockFD.Init(&d.locks) - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ - AllowDirectIO: true, - }); err != nil { + fd, err := newRegularFileFD(mnt, d, opts.Flags) + if err != nil { return nil, err } vfd = &fd.vfsfd @@ -1019,6 +1017,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } + if atomic.LoadInt32(&d.readFD) >= 0 { + fsmetric.GoferOpensHost.Increment() + } else { + fsmetric.GoferOpens9P.Increment() + } return &fd.vfsfd, nil case linux.S_IFLNK: // Can't open symlinks without O_PATH (which is unimplemented). @@ -1110,7 +1113,7 @@ retry: return nil, err } } - fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) if err != nil { h.close(ctx) return nil, err @@ -1205,11 +1208,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // Finally, construct a file description representing the created file. var childVFSFD *vfs.FileDescription if useRegularFileFD { - fd := ®ularFileFD{} - fd.LockFD.Init(&child.locks) - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ - AllowDirectIO: true, - }); err != nil { + fd, err := newRegularFileFD(mnt, child, opts.Flags) + if err != nil { return nil, err } childVFSFD = &fd.vfsfd @@ -1221,7 +1221,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving if fdobj != nil { h.fd = int32(fdobj.Release()) } - fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) if err != nil { h.close(ctx) return nil, err diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 75a836899..3cdb1e659 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -743,7 +743,9 @@ type dentry struct { // for memory mappings. If mmapFD is -1, no such FD is available, and the // internal page cache implementation is used for memory mappings instead. // - // These fields are protected by handleMu. + // These fields are protected by handleMu. readFD, writeFD, and mmapFD are + // additionally written using atomic memory operations, allowing them to be + // read (albeit racily) with atomic.LoadInt32() without locking handleMu. // // readFile and writeFile may or may not represent the same p9.File. Once // either p9.File transitions from closed (isNil() == true) to open @@ -1351,16 +1353,11 @@ func (d *dentry) checkCachingLocked(ctx context.Context) { return } if refs > 0 { - if d.cached { - // This isn't strictly necessary (fs.cachedDentries is permitted to - // contain dentries with non-zero refs, which are skipped by - // fs.evictCachedDentryLocked() upon reaching the end of the LRU), - // but since we are already holding fs.renameMu for writing we may - // as well. - d.fs.cachedDentries.Remove(d) - d.fs.cachedDentriesLen-- - d.cached = false - } + // This isn't strictly necessary (fs.cachedDentries is permitted to + // contain dentries with non-zero refs, which are skipped by + // fs.evictCachedDentryLocked() upon reaching the end of the LRU), but + // since we are already holding fs.renameMu for writing we may as well. + d.removeFromCacheLocked() return } // Deleted and invalidated dentries with zero references are no longer @@ -1369,20 +1366,18 @@ func (d *dentry) checkCachingLocked(ctx context.Context) { if d.isDeleted() { d.watches.HandleDeletion(ctx) } - if d.cached { - d.fs.cachedDentries.Remove(d) - d.fs.cachedDentriesLen-- - d.cached = false - } + d.removeFromCacheLocked() d.destroyLocked(ctx) return } - // If d still has inotify watches and it is not deleted or invalidated, we - // cannot cache it and allow it to be evicted. Otherwise, we will lose its - // watches, even if a new dentry is created for the same file in the future. - // Note that the size of d.watches cannot concurrently transition from zero - // to non-zero, because adding a watch requires holding a reference on d. + // If d still has inotify watches and it is not deleted or invalidated, it + // can't be evicted. Otherwise, we will lose its watches, even if a new + // dentry is created for the same file in the future. Note that the size of + // d.watches cannot concurrently transition from zero to non-zero, because + // adding a watch requires holding a reference on d. if d.watches.Size() > 0 { + // As in the refs > 0 case, this is not strictly necessary. + d.removeFromCacheLocked() return } @@ -1413,6 +1408,15 @@ func (d *dentry) checkCachingLocked(ctx context.Context) { } } +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) removeFromCacheLocked() { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } +} + // Precondition: fs.renameMu must be locked for writing; it may be temporarily // unlocked. func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { @@ -1426,12 +1430,10 @@ func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { // * fs.cachedDentriesLen != 0. func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { victim := fs.cachedDentries.Back() - fs.cachedDentries.Remove(victim) - fs.cachedDentriesLen-- - victim.cached = false - // victim.refs may have become non-zero from an earlier path resolution - // since it was inserted into fs.cachedDentries. - if atomic.LoadInt64(&victim.refs) == 0 { + victim.removeFromCacheLocked() + // victim.refs or victim.watches.Size() may have become non-zero from an + // earlier path resolution since it was inserted into fs.cachedDentries. + if atomic.LoadInt64(&victim.refs) == 0 && victim.watches.Size() == 0 { if victim.parent != nil { victim.parent.dirMu.Lock() if !victim.vfsd.IsDead() { @@ -1668,7 +1670,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool } fdsToClose = append(fdsToClose, d.readFD) invalidateTranslations = true - d.readFD = h.fd + atomic.StoreInt32(&d.readFD, h.fd) } else { // Otherwise, we want to avoid invalidating existing // memmap.Translations (which is expensive); instead, use @@ -1689,15 +1691,15 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool h.fd = d.readFD } } else { - d.readFD = h.fd + atomic.StoreInt32(&d.readFD, h.fd) } if d.writeFD != h.fd && d.writeFD >= 0 { fdsToClose = append(fdsToClose, d.writeFD) } - d.writeFD = h.fd - d.mmapFD = h.fd + atomic.StoreInt32(&d.writeFD, h.fd) + atomic.StoreInt32(&d.mmapFD, h.fd) } else if openReadable && d.readFD < 0 { - d.readFD = h.fd + atomic.StoreInt32(&d.readFD, h.fd) // If the file has not been opened for writing, the new FD may // be used for read-only memory mappings. If the file was // previously opened for reading (without an FD), then existing @@ -1705,10 +1707,10 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // invalidate those mappings. if d.writeFile.isNil() { invalidateTranslations = !d.readFile.isNil() - d.mmapFD = h.fd + atomic.StoreInt32(&d.mmapFD, h.fd) } } else if openWritable && d.writeFD < 0 { - d.writeFD = h.fd + atomic.StoreInt32(&d.writeFD, h.fd) if d.readFD >= 0 { // We have an existing read-only FD, but the file has just // been opened for writing, so we need to start supporting @@ -1717,7 +1719,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // writable memory mappings. Switch to using the internal // page cache. invalidateTranslations = true - d.mmapFD = -1 + atomic.StoreInt32(&d.mmapFD, -1) } } else { // The new FD is not useful. @@ -1729,7 +1731,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // memory mappings. However, we have no writable host FD. Switch to // using the internal page cache. invalidateTranslations = true - d.mmapFD = -1 + atomic.StoreInt32(&d.mmapFD, -1) } // Switch to new fids. diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 652142ecc..283b220bb 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -48,6 +49,25 @@ type regularFileFD struct { off int64 } +func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) { + fd := ®ularFileFD{} + fd.LockFD.Init(&d.locks) + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) { + fsmetric.GoferOpensWX.Increment() + } + if atomic.LoadInt32(&d.mmapFD) >= 0 { + fsmetric.GoferOpensHost.Increment() + } else { + fsmetric.GoferOpens9P.Increment() + } + return fd, nil +} + // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release(context.Context) { } @@ -89,6 +109,18 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + start := fsmetric.StartReadWait() + d := fd.dentry() + defer func() { + if atomic.LoadInt32(&d.readFD) >= 0 { + fsmetric.GoferReadsHost.Increment() + fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) + } else { + fsmetric.GoferReads9P.Increment() + fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) + } + }() + if offset < 0 { return 0, syserror.EINVAL } @@ -102,7 +134,6 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // Check for reading at EOF before calling into MM (but not under // InteropModeShared, which makes d.size unreliable). - d := fd.dentry() if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) { return 0, io.EOF } @@ -647,10 +678,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // Whether or not we have a host FD, we're not allowed to use it. return syserror.ENODEV } - d.handleMu.RLock() - haveFD := d.mmapFD >= 0 - d.handleMu.RUnlock() - if !haveFD { + if atomic.LoadInt32(&d.mmapFD) < 0 { return syserror.ENODEV } default: @@ -668,10 +696,7 @@ func (d *dentry) mayCachePages() bool { if d.fs.opts.forcePageCache { return true } - d.handleMu.RLock() - haveFD := d.mmapFD >= 0 - d.handleMu.RUnlock() - return haveFD + return atomic.LoadInt32(&d.mmapFD) >= 0 } // AddMapping implements memmap.Mappable.AddMapping. diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 625400c0b..089955a96 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -70,7 +71,7 @@ type specialFileFD struct { buf []byte } -func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) { +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { ftype := d.fileType() seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0 @@ -80,7 +81,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, seekable: seekable, haveQueue: haveQueue, } - fd.LockFD.Init(locks) + fd.LockFD.Init(&d.locks) if haveQueue { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err @@ -98,6 +99,14 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, d.fs.syncMu.Lock() d.fs.specialFileFDs[fd] = struct{}{} d.fs.syncMu.Unlock() + if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) { + fsmetric.GoferOpensWX.Increment() + } + if h.fd >= 0 { + fsmetric.GoferOpensHost.Increment() + } else { + fsmetric.GoferOpens9P.Increment() + } return fd, nil } @@ -161,6 +170,17 @@ func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + start := fsmetric.StartReadWait() + defer func() { + if fd.handle.fd >= 0 { + fsmetric.GoferReadsHost.Increment() + fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) + } else { + fsmetric.GoferReads9P.Increment() + fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) + } + }() + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index c14abcff4..565d723f0 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -286,7 +286,7 @@ func (d *Dentry) cacheLocked(ctx context.Context) { refs := atomic.LoadInt64(&d.refs) if refs == -1 { // Dentry has already been destroyed. - panic(fmt.Sprintf("cacheLocked called on a dentry which has already been destroyed: %v", d)) + return } if refs > 0 { if d.cached { diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 469f3a33d..27b00cf6f 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -16,7 +16,6 @@ package overlay import ( "fmt" - "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -129,25 +128,9 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { return err } defer newFD.DecRef(ctx) - bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size - for { - readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{}) - if readErr != nil && readErr != io.EOF { - cleanupUndoCopyUp() - return readErr - } - total := int64(0) - for total < readN { - writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{}) - total += writeN - if writeErr != nil { - cleanupUndoCopyUp() - return writeErr - } - } - if readErr == io.EOF { - break - } + if _, err := vfs.CopyRegularFileData(ctx, newFD, oldFD); err != nil { + cleanupUndoCopyUp() + return err } d.mapsMu.Lock() defer d.mapsMu.Unlock() diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go index 2b89a7a6d..25c785fd4 100644 --- a/pkg/sentry/fsimpl/overlay/regular_file.go +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -103,8 +103,8 @@ func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescript for e, mask := range fd.lowerWaiters { fd.cachedFD.EventUnregister(e) upperFD.EventRegister(e, mask) - if ready&mask != 0 { - e.Callback.Callback(e) + if m := ready & mask; m != 0 { + e.Callback.Callback(e, m) } } } diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 5cf8a071a..d4f6a5a9b 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -208,7 +208,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { for _, se := range n.kernel.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { - log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) + // Racing with socket destruction, this is ok. continue } if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { @@ -351,7 +351,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, for _, se := range k.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { - log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) + // Racing with socket destruction, this is ok. continue } sops, ok := s.Impl().(socket.SocketVFS2) @@ -516,7 +516,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { for _, se := range d.kernel.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { - log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) + // Racing with socket destruction, this is ok. continue } sops, ok := s.Impl().(socket.SocketVFS2) diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 7c7afdcfa..25c407d98 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -44,6 +44,7 @@ func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k * return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), + "sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)), "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)), "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)), diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index 10f1452ef..246bd87bc 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package signalfd provides basic signalfd file implementations. package signalfd import ( @@ -98,8 +99,8 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, - PID: uint32(info.Pid()), - UID: uint32(info.Uid()), + PID: uint32(info.PID()), + UID: uint32(info.UID()), Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index fe520b6fd..09957c2b7 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -67,6 +67,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", + "//pkg/sentry/fsmetric", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e39cd305b..9296db2fb 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -381,6 +382,8 @@ afterTrailingSymlink: creds := rp.Credentials() child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)) parentDir.insertChildLocked(child, name) + child.IncRef() + defer child.DecRef(ctx) unlock() fd, err := child.open(ctx, rp, &opts, true) if err != nil { @@ -437,6 +440,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return nil, err } } + if fd.vfsfd.IsWritable() { + fsmetric.TmpfsOpensW.Increment() + } else if fd.vfsfd.IsReadable() { + fsmetric.TmpfsOpensRO.Increment() + } return &fd.vfsfd, nil case *directory: // Can't open directories writably. diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index f8e0cffb0..6255a7c84 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -359,6 +360,10 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + start := fsmetric.StartReadWait() + defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start) + fsmetric.TmpfsReads.Increment() + if offset < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index add5dd48e..a4ad625bb 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -107,8 +107,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // Dentries which may have a reference count of zero, and which therefore // should be dropped once traversal is complete, are appended to ds. // -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR @@ -158,15 +160,19 @@ afterSymlink: return child, nil } -// verifyChild verifies the hash of child against the already verified hash of -// the parent to ensure the child is expected. verifyChild triggers a sentry -// panic if unexpected modifications to the file system are detected. In -// noCrashOnVerificationFailure mode it returns a syserror instead. -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// verifyChildLocked verifies the hash of child against the already verified +// hash of the parent to ensure the child is expected. verifyChild triggers a +// sentry panic if unexpected modifications to the file system are detected. In +// ErrorOnViolation mode it returns a syserror instead. +// +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// // TODO(b/166474175): Investigate all possible errors returned in this // function, and make sure we differentiate all errors that indicate unexpected // modifications to the file system from the ones that are not harmful. -func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { +func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { vfsObj := fs.vfsfs.VirtualFilesystem() // Get the path to the child dentry. This is only used to provide path @@ -248,7 +254,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) } - fdReader := vfs.FileReadWriteSeeker{ + fdReader := FileReadWriteSeeker{ FD: parentMerkleFD, Ctx: ctx, } @@ -268,7 +274,8 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // contain the hash of the children in the parent Merkle tree when // Verify returns with success. var buf bytes.Buffer - if _, err := merkletree.Verify(&merkletree.VerifyParams{ + parent.hashMu.RLock() + _, err = merkletree.Verify(&merkletree.VerifyParams{ Out: &buf, File: &fdReader, Tree: &fdReader, @@ -284,21 +291,27 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())), Expected: parent.hash, DataAndTreeInSameFile: true, - }); err != nil && err != io.EOF { + }) + parent.hashMu.RUnlock() + if err != nil && err != io.EOF { return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err)) } // Cache child hash when it's verified the first time. + child.hashMu.Lock() if len(child.hash) == 0 { child.hash = buf.Bytes() } + child.hashMu.Unlock() return child, nil } -// verifyStatAndChildren verifies the stat and children names against the +// verifyStatAndChildrenLocked verifies the stat and children names against the // verified hash. The mode/uid/gid and childrenNames of the file is cached // after verified. -func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat linux.Statx) error { +// +// Preconditions: d.dirMu must be locked. +func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry, stat linux.Statx) error { vfsObj := fs.vfsfs.VirtualFilesystem() // Get the path to the child dentry. This is only used to provide path @@ -384,12 +397,13 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat } } - fdReader := vfs.FileReadWriteSeeker{ + fdReader := FileReadWriteSeeker{ FD: fd, Ctx: ctx, } var buf bytes.Buffer + d.hashMu.RLock() params := &merkletree.VerifyParams{ Out: &buf, Tree: &fdReader, @@ -407,6 +421,7 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat Expected: d.hash, DataAndTreeInSameFile: false, } + d.hashMu.RUnlock() if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR { params.DataAndTreeInSameFile = true } @@ -421,7 +436,9 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat return nil } -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if child, ok := parent.children[name]; ok { // If verity is enabled on child, we should check again whether @@ -470,7 +487,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s // be cached before enabled. if fs.allowRuntimeEnable { if parent.verityEnabled() { - if _, err := fs.verifyChild(ctx, parent, child); err != nil { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { return nil, err } } @@ -486,7 +503,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s if err != nil { return nil, err } - if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { return nil, err } } @@ -506,7 +523,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s return child, nil } -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { vfsObj := fs.vfsfs.VirtualFilesystem() @@ -597,13 +616,13 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // allowRuntimeEnable mode and the parent directory hasn't been enabled // yet. if parent.verityEnabled() { - if _, err := fs.verifyChild(ctx, parent, child); err != nil { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { child.destroyLocked(ctx) return nil, err } } if child.verityEnabled() { - if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { child.destroyLocked(ctx) return nil, err } @@ -617,7 +636,9 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -958,11 +979,13 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } + d.dirMu.Lock() if d.verityEnabled() { - if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return linux.Statx{}, err } } + d.dirMu.Unlock() return stat, nil } diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 87dabe038..66029c64d 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -19,6 +19,18 @@ // The verity file system is read-only, except for one case: when // allowRuntimeEnable is true, additional Merkle files can be generated using // the FS_IOC_ENABLE_VERITY ioctl. +// +// Lock order: +// +// filesystem.renameMu +// dentry.dirMu +// fileDescription.mu +// filesystem.verityMu +// dentry.hashMu +// +// Locking dentry.dirMu in multiple dentries requires that parent dentries are +// locked before child dentries, and that filesystem.renameMu is locked to +// stabilize this relationship. package verity import ( @@ -52,6 +64,10 @@ const ( // tree file for "/foo" is "/.merkle.verity.foo". merklePrefix = ".merkle.verity." + // merkleRootPrefix is the prefix of the Merkle tree root file. This + // needs to be different from merklePrefix to avoid name collision. + merkleRootPrefix = ".merkleroot.verity." + // merkleOffsetInParentXattr is the extended attribute name specifying the // offset of the child hash in its parent's Merkle tree. merkleOffsetInParentXattr = "user.merkle.offset" @@ -76,13 +92,8 @@ const ( ) var ( - // noCrashOnVerificationFailure indicates whether the sandbox should panic - // whenever verification fails. If true, an error is returned instead of - // panicking. This should only be set for tests. - // - // TODO(b/165661693): Decide whether to panic or return error based on this - // flag. - noCrashOnVerificationFailure bool + // action specifies the action towards detected violation. + action ViolationAction // verityMu synchronizes concurrent operations that enable verity and perform // verification checks. @@ -93,6 +104,18 @@ var ( // content. type HashAlgorithm int +// ViolationAction is a type specifying the action when an integrity violation +// is detected. +type ViolationAction int + +const ( + // PanicOnViolation terminates the sentry on detected violation. + PanicOnViolation ViolationAction = 0 + // ErrorOnViolation returns an error from the violating system call on + // detected violation. + ErrorOnViolation = 1 +) + // Currently supported hashing algorithms include SHA256 and SHA512. const ( SHA256 HashAlgorithm = iota @@ -187,10 +210,8 @@ type InternalFilesystemOptions struct { // system wrapped by verity file system. LowerGetFSOptions vfs.GetFilesystemOptions - // NoCrashOnVerificationFailure indicates whether the sandbox should - // panic whenever verification fails. If true, an error is returned - // instead of panicking. This should only be set for tests. - NoCrashOnVerificationFailure bool + // Action specifies the action on an integrity violation. + Action ViolationAction } // Name implements vfs.FilesystemType.Name. @@ -202,10 +223,10 @@ func (FilesystemType) Name() string { func (FilesystemType) Release(ctx context.Context) {} // alertIntegrityViolation alerts a violation of integrity, which usually means -// unexpected modification to the file system is detected. In -// noCrashOnVerificationFailure mode, it returns EIO, otherwise it panic. +// unexpected modification to the file system is detected. In ErrorOnViolation +// mode, it returns EIO, otherwise it panic. func alertIntegrityViolation(msg string) error { - if noCrashOnVerificationFailure { + if action == ErrorOnViolation { return syserror.EIO } panic(msg) @@ -218,7 +239,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs") return nil, nil, syserror.EINVAL } - noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure + action = iopts.Action // Mount the lower file system. The lower file system is wrapped inside // verity, and should not be exposed or connected. @@ -246,7 +267,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt lowerVD.IncRef() d.lowerVD = lowerVD - rootMerkleName := merklePrefix + iopts.RootMerkleFileName + rootMerkleName := merkleRootPrefix + iopts.RootMerkleFileName lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ Root: lowerVD, @@ -372,12 +393,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err)) } - if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return nil, nil, err } } + d.hashMu.Lock() copy(d.hash, iopts.RootHash) + d.hashMu.Unlock() d.vfsd.Init(d) fs.rootDentry = d @@ -402,7 +425,8 @@ type dentry struct { fs *filesystem // mode, uid, gid and size are the file mode, owner, group, and size of - // the file in the underlying file system. + // the file in the underlying file system. They are set when a dentry + // is initialized, and never modified. mode uint32 uid uint32 gid uint32 @@ -425,18 +449,22 @@ type dentry struct { // childrenNames stores the name of all children of the dentry. This is // used by verity to check whether a child is expected. This is only - // populated by enableVerity. + // populated by enableVerity. childrenNames is also protected by dirMu. childrenNames map[string]struct{} - // lowerVD is the VirtualDentry in the underlying file system. + // lowerVD is the VirtualDentry in the underlying file system. It is + // never modified after initialized. lowerVD vfs.VirtualDentry // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree - // in the underlying file system. + // in the underlying file system. It is never modified after + // initialized. lowerMerkleVD vfs.VirtualDentry - // hash is the calculated hash for the current file or directory. - hash []byte + // hash is the calculated hash for the current file or directory. hash + // is protected by hashMu. + hashMu sync.RWMutex `state:"nosave"` + hash []byte } // newDentry creates a new dentry representing the given verity file. The @@ -519,7 +547,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) { // destroyLocked destroys the dentry. // -// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. +// Preconditions: +// * d.fs.renameMu must be locked for writing. +// * d.refs == 0. func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: @@ -599,6 +629,8 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) // mode, it returns true if the target has been enabled with // ioctl(FS_IOC_ENABLE_VERITY). func (d *dentry) verityEnabled() bool { + d.hashMu.RLock() + defer d.hashMu.RUnlock() return !d.fs.allowRuntimeEnable || len(d.hash) != 0 } @@ -678,11 +710,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu if err != nil { return linux.Statx{}, err } + fd.d.dirMu.Lock() if fd.d.verityEnabled() { - if err := fd.d.fs.verifyStatAndChildren(ctx, fd.d, stat); err != nil { + if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil { return linux.Statx{}, err } } + fd.d.dirMu.Unlock() return stat, nil } @@ -718,22 +752,24 @@ func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) return offset, nil } -// generateMerkle generates a Merkle tree file for fd. If fd points to a file -// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash -// of the generated Merkle tree and the data size is returned. If fd points to -// a regular file, the data is the content of the file. If fd points to a -// directory, the data is all hahes of its children, written to the Merkle tree -// file. -func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) { - fdReader := vfs.FileReadWriteSeeker{ +// generateMerkleLocked generates a Merkle tree file for fd. If fd points to a +// file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The +// hash of the generated Merkle tree and the data size is returned. If fd +// points to a regular file, the data is the content of the file. If fd points +// to a directory, the data is all hashes of its children, written to the Merkle +// tree file. +// +// Preconditions: fd.d.fs.verityMu must be locked. +func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) { + fdReader := FileReadWriteSeeker{ FD: fd.lowerFD, Ctx: ctx, } - merkleReader := vfs.FileReadWriteSeeker{ + merkleReader := FileReadWriteSeeker{ FD: fd.merkleReader, Ctx: ctx, } - merkleWriter := vfs.FileReadWriteSeeker{ + merkleWriter := FileReadWriteSeeker{ FD: fd.merkleWriter, Ctx: ctx, } @@ -793,11 +829,14 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, return hash, uint64(params.Size), err } -// recordChildren writes the names of fd's children into the corresponding -// Merkle tree file, and saves the offset/size of the map into xattrs. +// recordChildrenLocked writes the names of fd's children into the +// corresponding Merkle tree file, and saves the offset/size of the map into +// xattrs. // -// Preconditions: fd.d.isDir() == true -func (fd *fileDescription) recordChildren(ctx context.Context) error { +// Preconditions: +// * fd.d.fs.verityMu must be locked. +// * fd.d.isDir() == true. +func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error { // Record the children names in the Merkle tree file. childrenNames, err := json.Marshal(fd.d.childrenNames) if err != nil { @@ -847,7 +886,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") } - hash, dataSize, err := fd.generateMerkle(ctx) + hash, dataSize, err := fd.generateMerkleLocked(ctx) if err != nil { return 0, err } @@ -888,11 +927,13 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { } if fd.d.isDir() { - if err := fd.recordChildren(ctx); err != nil { + if err := fd.recordChildrenLocked(ctx); err != nil { return 0, err } } - fd.d.hash = append(fd.d.hash, hash...) + fd.d.hashMu.Lock() + fd.d.hash = hash + fd.d.hashMu.Unlock() return 0, nil } @@ -904,6 +945,9 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm } var metadata linux.DigestMetadata + fd.d.hashMu.RLock() + defer fd.d.hashMu.RUnlock() + // If allowRuntimeEnable is true, an empty fd.d.hash indicates that // verity is not enabled for the file. If allowRuntimeEnable is false, // this is an integrity violation because all files should have verity @@ -940,11 +984,13 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm func (fd *fileDescription) verityFlags(ctx context.Context, flags usermem.Addr) (uintptr, error) { f := int32(0) + fd.d.hashMu.RLock() // All enabled files should store a hash. This flag is not settable via // FS_IOC_SETFLAGS. if len(fd.d.hash) != 0 { f |= linux.FS_VERITY_FL } + fd.d.hashMu.RUnlock() t := kernel.TaskFromContext(ctx) if t == nil { @@ -1013,16 +1059,17 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err)) } - dataReader := vfs.FileReadWriteSeeker{ + dataReader := FileReadWriteSeeker{ FD: fd.lowerFD, Ctx: ctx, } - merkleReader := vfs.FileReadWriteSeeker{ + merkleReader := FileReadWriteSeeker{ FD: fd.merkleReader, Ctx: ctx, } + fd.d.hashMu.RLock() n, err := merkletree.Verify(&merkletree.VerifyParams{ Out: dst.Writer(ctx), File: &dataReader, @@ -1040,6 +1087,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Expected: fd.d.hash, DataAndTreeInSameFile: false, }) + fd.d.hashMu.RUnlock() if err != nil { return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err)) } @@ -1065,3 +1113,45 @@ func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence) } + +// FileReadWriteSeeker is a helper struct to pass a vfs.FileDescription as +// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc. +type FileReadWriteSeeker struct { + FD *vfs.FileDescription + Ctx context.Context + ROpts vfs.ReadOptions + WOpts vfs.WriteOptions +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts) + return int(n), err +} + +// Read implements io.ReadWriteSeeker.Read. +func (f *FileReadWriteSeeker) Read(p []byte) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.Read(f.Ctx, dst, f.ROpts) + return int(n), err +} + +// Seek implements io.ReadWriteSeeker.Seek. +func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) { + return f.FD.Seek(f.Ctx, offset, int32(whence)) +} + +// WriteAt implements io.WriterAt.WriteAt. +func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts) + return int(n), err +} + +// Write implements io.ReadWriteSeeker.Write. +func (f *FileReadWriteSeeker) Write(p []byte) (int, error) { + buf := usermem.BytesIOSequence(p) + n, err := f.FD.Write(f.Ctx, buf, f.WOpts) + return int(n), err +} diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go index 7196e74eb..30d8b4355 100644 --- a/pkg/sentry/fsimpl/verity/verity_test.go +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -35,16 +35,39 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// rootMerkleFilename is the name of the root Merkle tree file. -const rootMerkleFilename = "root.verity" +const ( + // rootMerkleFilename is the name of the root Merkle tree file. + rootMerkleFilename = "root.verity" + // maxDataSize is the maximum data size of a test file. + maxDataSize = 100000 +) + +var hashAlgs = []HashAlgorithm{SHA256, SHA512} -// maxDataSize is the maximum data size written to the file for test. -const maxDataSize = 100000 +func dentryFromVD(t *testing.T, vd vfs.VirtualDentry) *dentry { + t.Helper() + d, ok := vd.Dentry().Impl().(*dentry) + if !ok { + t.Fatalf("can't assert %T as a *dentry", vd) + } + return d +} + +// dentryFromFD returns the dentry corresponding to fd. +func dentryFromFD(t *testing.T, fd *vfs.FileDescription) *dentry { + t.Helper() + f, ok := fd.Impl().(*fileDescription) + if !ok { + t.Fatalf("can't assert %T as a *fileDescription", fd) + } + return f.d +} // newVerityRoot creates a new verity mount, and returns the root. The // underlying file system is tmpfs. If the error is not nil, then cleanup // should be called when the root is no longer needed. func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) { + t.Helper() k, err := testutil.Boot() if err != nil { t.Fatalf("testutil.Boot: %v", err) @@ -69,11 +92,11 @@ func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalData: InternalFilesystemOptions{ - RootMerkleFileName: rootMerkleFilename, - LowerName: "tmpfs", - Alg: hashAlg, - AllowRuntimeEnable: true, - NoCrashOnVerificationFailure: true, + RootMerkleFileName: rootMerkleFilename, + LowerName: "tmpfs", + Alg: hashAlg, + AllowRuntimeEnable: true, + Action: ErrorOnViolation, }, }, }) @@ -92,7 +115,6 @@ func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, t.Fatalf("testutil.CreateTask: %v", err) } - t.Helper() t.Cleanup(func() { root.DecRef(ctx) mntns.DecRef(ctx) @@ -100,21 +122,97 @@ func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, return vfsObj, root, task, nil } -// newFileFD creates a new file in the verity mount, and returns the FD. The FD -// points to a file that has random data generated. -func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) { - creds := auth.CredentialsFromContext(ctx) - lowerRoot := root.Dentry().Impl().(*dentry).lowerVD +// openVerityAt opens a verity file. +// +// TODO(chongc): release reference from opening the file when done. +func openVerityAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, path string, flags uint32, mode linux.FileMode) (*vfs.FileDescription, error) { + return vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: vd, + Start: vd, + Path: fspath.Parse(path), + }, &vfs.OpenOptions{ + Flags: flags, + Mode: mode, + }) +} - // Create the file in the underlying file system. - lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: lowerRoot, - Start: lowerRoot, - Path: fspath.Parse(filePath), +// openLowerAt opens the file in the underlying file system. +// +// TODO(chongc): release reference from opening the file when done. +func (d *dentry) openLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, path string, flags uint32, mode linux.FileMode) (*vfs.FileDescription, error) { + return vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(path), + }, &vfs.OpenOptions{ + Flags: flags, + Mode: mode, + }) +} + +// openLowerMerkleAt opens the Merkle file in the underlying file system. +// +// TODO(chongc): release reference from opening the file when done. +func (d *dentry) openLowerMerkleAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, flags uint32, mode linux.FileMode) (*vfs.FileDescription, error) { + return vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: linux.ModeRegular | mode, + Flags: flags, + Mode: mode, + }) +} + +// unlinkLowerAt deletes the file in the underlying file system. +func (d *dentry) unlinkLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, path string) error { + return vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(path), }) +} + +// unlinkLowerMerkleAt deletes the Merkle file in the underlying file system. +func (d *dentry) unlinkLowerMerkleAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, path string) error { + return vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(merklePrefix + path), + }) +} + +// renameLowerAt renames file name to newName in the underlying file system. +func (d *dentry) renameLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, name string, newName string) error { + return vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(name), + }, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(newName), + }, &vfs.RenameOptions{}) +} + +// renameLowerMerkleAt renames Merkle file name to newName in the underlying +// file system. +func (d *dentry) renameLowerMerkleAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, name string, newName string) error { + return vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(merklePrefix + name), + }, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(merklePrefix + newName), + }, &vfs.RenameOptions{}) +} + +// newFileFD creates a new file in the verity mount, and returns the FD. The FD +// points to a file that has random data generated. +func newFileFD(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) { + // Create the file in the underlying file system. + lowerFD, err := dentryFromVD(t, root).openLowerAt(ctx, vfsObj, filePath, linux.O_RDWR|linux.O_CREAT|linux.O_EXCL, linux.ModeRegular|mode) if err != nil { return nil, 0, err } @@ -137,20 +235,24 @@ func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.Virt lowerFD.DecRef(ctx) // Now open the verity file descriptor. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filePath), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - Mode: linux.ModeRegular | mode, - }) + fd, err := openVerityAt(ctx, vfsObj, root, filePath, linux.O_RDONLY, mode) return fd, dataSize, err } -// corruptRandomBit randomly flips a bit in the file represented by fd. -func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error { - // Flip a random bit in the underlying file. +// newEmptyFileFD creates a new empty file in the verity mount, and returns the FD. +func newEmptyFileFD(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, error) { + // Create the file in the underlying file system. + _, err := dentryFromVD(t, root).openLowerAt(ctx, vfsObj, filePath, linux.O_RDWR|linux.O_CREAT|linux.O_EXCL, linux.ModeRegular|mode) + if err != nil { + return nil, err + } + // Now open the verity file descriptor. + fd, err := openVerityAt(ctx, vfsObj, root, filePath, linux.O_RDONLY, mode) + return fd, err +} + +// flipRandomBit randomly flips a bit in the file represented by fd. +func flipRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error { randomPos := int64(rand.Intn(size)) byteToModify := make([]byte, 1) if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil { @@ -163,7 +265,14 @@ func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) er return nil } -var hashAlgs = []HashAlgorithm{SHA256, SHA512} +func enableVerity(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { + t.Helper() + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("enable verity: %v", err) + } +} // TestOpen ensures that when a file is created, the corresponding Merkle tree // file and the root Merkle tree file exist. @@ -175,30 +284,18 @@ func TestOpen(t *testing.T) { } filename := "verity-test-file" - if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil { + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) + if err != nil { t.Fatalf("newFileFD: %v", err) } // Ensure that the corresponding Merkle tree file is created. - lowerRoot := root.Dentry().Impl().(*dentry).lowerVD - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: lowerRoot, - Start: lowerRoot, - Path: fspath.Parse(merklePrefix + filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - }); err != nil { + if _, err = dentryFromFD(t, fd).openLowerMerkleAt(ctx, vfsObj, linux.O_RDONLY, linux.ModeRegular); err != nil { t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err) } // Ensure the root merkle tree file is created. - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: lowerRoot, - Start: lowerRoot, - Path: fspath.Parse(merklePrefix + rootMerkleFilename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - }); err != nil { + if _, err = dentryFromVD(t, root).openLowerMerkleAt(ctx, vfsObj, linux.O_RDONLY, linux.ModeRegular); err != nil { t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err) } } @@ -214,17 +311,13 @@ func TestPReadUnmodifiedFileSucceeds(t *testing.T) { } filename := "verity-test-file" - fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file and confirm a normal read succeeds. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) buf := make([]byte, size) n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}) @@ -248,17 +341,13 @@ func TestReadUnmodifiedFileSucceeds(t *testing.T) { } filename := "verity-test-file" - fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file and confirm a normal read succeeds. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) buf := make([]byte, size) n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) @@ -272,6 +361,36 @@ func TestReadUnmodifiedFileSucceeds(t *testing.T) { } } +// TestReadUnmodifiedEmptyFileSucceeds ensures that read from an untouched empty verity +// file succeeds after enabling verity for it. +func TestReadUnmodifiedEmptyFileSucceeds(t *testing.T) { + for _, alg := range hashAlgs { + vfsObj, root, ctx, err := newVerityRoot(t, alg) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-empty-file" + fd, err := newEmptyFileFD(ctx, t, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newEmptyFileFD: %v", err) + } + + // Enable verity on the file and confirm a normal read succeeds. + enableVerity(ctx, t, fd) + + var buf []byte + n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.Read: %v", err) + } + + if n != 0 { + t.Errorf("fd.Read got read length %d, expected 0", n) + } + } +} + // TestReopenUnmodifiedFileSucceeds ensures that reopen an untouched verity file // succeeds after enabling verity for it. func TestReopenUnmodifiedFileSucceeds(t *testing.T) { @@ -282,27 +401,16 @@ func TestReopenUnmodifiedFileSucceeds(t *testing.T) { } filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file and confirms a normal read succeeds. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) // Ensure reopening the verity enabled file succeeds. - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - Mode: linux.ModeRegular, - }); err != nil { + if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != nil { t.Errorf("reopen enabled file failed: %v", err) } } @@ -317,43 +425,24 @@ func TestOpenNonexistentFile(t *testing.T) { } filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file and confirms a normal read succeeds. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) // Enable verity on the parent directory. - parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - }) + parentFD, err := openVerityAt(ctx, vfsObj, root, "", linux.O_RDONLY, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt: %v", err) } - - if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, parentFD) // Ensure open an unexpected file in the parent directory fails with // ENOENT rather than verification failure. - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename + "abc"), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - Mode: linux.ModeRegular, - }); err != syserror.ENOENT { + if _, err = openVerityAt(ctx, vfsObj, root, filename+"abc", linux.O_RDONLY, linux.ModeRegular); err != syserror.ENOENT { t.Errorf("OpenAt unexpected error: %v", err) } } @@ -368,33 +457,22 @@ func TestPReadModifiedFileFails(t *testing.T) { } filename := "verity-test-file" - fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) // Open a new lowerFD that's read/writable. - lowerVD := fd.Impl().(*fileDescription).d.lowerVD - - lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: lowerVD, - Start: lowerVD, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR, - }) + lowerFD, err := dentryFromFD(t, fd).openLowerAt(ctx, vfsObj, "", linux.O_RDWR, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt: %v", err) } - if err := corruptRandomBit(ctx, lowerFD, size); err != nil { - t.Fatalf("corruptRandomBit: %v", err) + if err := flipRandomBit(ctx, lowerFD, size); err != nil { + t.Fatalf("flipRandomBit: %v", err) } // Confirm that read from the modified file fails. @@ -415,33 +493,22 @@ func TestReadModifiedFileFails(t *testing.T) { } filename := "verity-test-file" - fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) // Open a new lowerFD that's read/writable. - lowerVD := fd.Impl().(*fileDescription).d.lowerVD - - lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: lowerVD, - Start: lowerVD, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR, - }) + lowerFD, err := dentryFromFD(t, fd).openLowerAt(ctx, vfsObj, "", linux.O_RDWR, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt: %v", err) } - if err := corruptRandomBit(ctx, lowerFD, size); err != nil { - t.Fatalf("corruptRandomBit: %v", err) + if err := flipRandomBit(ctx, lowerFD, size); err != nil { + t.Fatalf("flipRandomBit: %v", err) } // Confirm that read from the modified file fails. @@ -462,27 +529,16 @@ func TestModifiedMerkleFails(t *testing.T) { } filename := "verity-test-file" - fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, size, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) // Open a new lowerMerkleFD that's read/writable. - lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD - - lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: lowerMerkleVD, - Start: lowerMerkleVD, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR, - }) + lowerMerkleFD, err := dentryFromFD(t, fd).openLowerMerkleAt(ctx, vfsObj, linux.O_RDWR, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt: %v", err) } @@ -493,14 +549,13 @@ func TestModifiedMerkleFails(t *testing.T) { t.Errorf("lowerMerkleFD.Stat: %v", err) } - if err := corruptRandomBit(ctx, lowerMerkleFD, int(stat.Size)); err != nil { - t.Fatalf("corruptRandomBit: %v", err) + if err := flipRandomBit(ctx, lowerMerkleFD, int(stat.Size)); err != nil { + t.Fatalf("flipRandomBit: %v", err) } // Confirm that read from a file with modified Merkle tree fails. buf := make([]byte, size) if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil { - fmt.Println(buf) t.Fatalf("fd.PRead succeeded with modified Merkle file") } } @@ -517,42 +572,23 @@ func TestModifiedParentMerkleFails(t *testing.T) { } filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) // Enable verity on the parent directory. - parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - }) + parentFD, err := openVerityAt(ctx, vfsObj, root, "", linux.O_RDONLY, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt: %v", err) } - - if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, parentFD) // Open a new lowerMerkleFD that's read/writable. - parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD - - parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: parentLowerMerkleVD, - Start: parentLowerMerkleVD, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR, - }) + parentLowerMerkleFD, err := dentryFromFD(t, fd).parent.openLowerMerkleAt(ctx, vfsObj, linux.O_RDWR, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt: %v", err) } @@ -572,21 +608,14 @@ func TestModifiedParentMerkleFails(t *testing.T) { if err != nil { t.Fatalf("Failed convert size to int: %v", err) } - if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil { - t.Fatalf("corruptRandomBit: %v", err) + if err := flipRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil { + t.Fatalf("flipRandomBit: %v", err) } parentLowerMerkleFD.DecRef(ctx) // Ensure reopening the verity enabled file fails. - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - Mode: linux.ModeRegular, - }); err == nil { + if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err == nil { t.Errorf("OpenAt file with modified parent Merkle succeeded") } } @@ -602,18 +631,13 @@ func TestUnmodifiedStatSucceeds(t *testing.T) { } filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } - // Enable verity on the file and confirms stat succeeds. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("fd.Ioctl: %v", err) - } - + // Enable verity on the file and confirm that stat succeeds. + enableVerity(ctx, t, fd) if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil { t.Errorf("fd.Stat: %v", err) } @@ -630,17 +654,13 @@ func TestModifiedStatFails(t *testing.T) { } filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("fd.Ioctl: %v", err) - } + enableVerity(ctx, t, fd) lowerFD := fd.Impl().(*fileDescription).lowerFD // Change the stat of the underlying file, and check that stat fails. @@ -663,73 +683,57 @@ func TestModifiedStatFails(t *testing.T) { // and/or the corresponding Merkle tree file fails with the verity error. func TestOpenDeletedFileFails(t *testing.T) { testCases := []struct { + name string // The original file is removed if changeFile is true. changeFile bool // The Merkle tree file is removed if changeMerkleFile is true. changeMerkleFile bool }{ { + name: "FileOnly", changeFile: true, changeMerkleFile: false, }, { + name: "MerkleOnly", changeFile: false, changeMerkleFile: true, }, { + name: "FileAndMerkle", changeFile: true, changeMerkleFile: true, }, } for _, tc := range testCases { - t.Run(fmt.Sprintf("changeFile:%t, changeMerkleFile:%t", tc.changeFile, tc.changeMerkleFile), func(t *testing.T) { + t.Run(tc.name, func(t *testing.T) { vfsObj, root, ctx, err := newVerityRoot(t, SHA256) if err != nil { t.Fatalf("newVerityRoot: %v", err) } filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) - rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD if tc.changeFile { - if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(filename), - }); err != nil { + if err := dentryFromVD(t, root).unlinkLowerAt(ctx, vfsObj, filename); err != nil { t.Fatalf("UnlinkAt: %v", err) } } if tc.changeMerkleFile { - if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(merklePrefix + filename), - }); err != nil { + if err := dentryFromVD(t, root).unlinkLowerMerkleAt(ctx, vfsObj, filename); err != nil { t.Fatalf("UnlinkAt: %v", err) } } // Ensure reopening the verity enabled file fails. - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - Mode: linux.ModeRegular, - }); err != syserror.EIO { + if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO { t.Errorf("got OpenAt error: %v, expected EIO", err) } }) @@ -740,82 +744,58 @@ func TestOpenDeletedFileFails(t *testing.T) { // and/or the corresponding Merkle tree file fails with the verity error. func TestOpenRenamedFileFails(t *testing.T) { testCases := []struct { + name string // The original file is renamed if changeFile is true. changeFile bool // The Merkle tree file is renamed if changeMerkleFile is true. changeMerkleFile bool }{ { + name: "FileOnly", changeFile: true, changeMerkleFile: false, }, { + name: "MerkleOnly", changeFile: false, changeMerkleFile: true, }, { + name: "FileAndMerkle", changeFile: true, changeMerkleFile: true, }, } for _, tc := range testCases { - t.Run(fmt.Sprintf("changeFile:%t, changeMerkleFile:%t", tc.changeFile, tc.changeMerkleFile), func(t *testing.T) { + t.Run(tc.name, func(t *testing.T) { vfsObj, root, ctx, err := newVerityRoot(t, SHA256) if err != nil { t.Fatalf("newVerityRoot: %v", err) } filename := "verity-test-file" - fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + fd, _, err := newFileFD(ctx, t, vfsObj, root, filename, 0644) if err != nil { t.Fatalf("newFileFD: %v", err) } // Enable verity on the file. - var args arch.SyscallArguments - args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} - if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { - t.Fatalf("Ioctl: %v", err) - } + enableVerity(ctx, t, fd) - rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD newFilename := "renamed-test-file" if tc.changeFile { - if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(filename), - }, &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(newFilename), - }, &vfs.RenameOptions{}); err != nil { + if err := dentryFromVD(t, root).renameLowerAt(ctx, vfsObj, filename, newFilename); err != nil { t.Fatalf("RenameAt: %v", err) } } if tc.changeMerkleFile { - if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(merklePrefix + filename), - }, &vfs.PathOperation{ - Root: rootLowerVD, - Start: rootLowerVD, - Path: fspath.Parse(merklePrefix + newFilename), - }, &vfs.RenameOptions{}); err != nil { + if err := dentryFromVD(t, root).renameLowerMerkleAt(ctx, vfsObj, filename, newFilename); err != nil { t.Fatalf("UnlinkAt: %v", err) } } // Ensure reopening the verity enabled file fails. - if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - Mode: linux.ModeRegular, - }); err != syserror.EIO { + if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO { t.Errorf("got OpenAt error: %v, expected EIO", err) } }) diff --git a/pkg/sentry/fsmetric/BUILD b/pkg/sentry/fsmetric/BUILD new file mode 100644 index 000000000..4e86fbdd8 --- /dev/null +++ b/pkg/sentry/fsmetric/BUILD @@ -0,0 +1,10 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "fsmetric", + srcs = ["fsmetric.go"], + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/metric"], +) diff --git a/pkg/sentry/fsmetric/fsmetric.go b/pkg/sentry/fsmetric/fsmetric.go new file mode 100644 index 000000000..7e535b527 --- /dev/null +++ b/pkg/sentry/fsmetric/fsmetric.go @@ -0,0 +1,83 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsmetric defines filesystem metrics that are used by both VFS1 and +// VFS2. +// +// TODO(gvisor.dev/issue/1624): Once VFS1 is deleted, inline these metrics into +// VFS2. +package fsmetric + +import ( + "time" + + "gvisor.dev/gvisor/pkg/metric" +) + +// RecordWaitTime enables the ReadWait, GoferReadWait9P, GoferReadWaitHost, and +// TmpfsReadWait metrics. Enabling this comes at a CPU cost due to performing +// three clock reads per read call. +// +// Note that this is only performed in the direct read path, and may not be +// consistently applied for other forms of reads, such as splice. +var RecordWaitTime = false + +// Metrics that apply to all filesystems. +var ( + Opens = metric.MustCreateNewUint64Metric("/fs/opens", false /* sync */, "Number of file opens.") + Reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") + ReadWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.") +) + +// Metrics that only apply to fs/gofer and fsimpl/gofer. +var ( + GoferOpensWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a executable file was opened writably from a gofer.") + GoferOpens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a file was opened from a gofer and did not have a host file descriptor.") + GoferOpensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a file was opened from a gofer and did have a host file descriptor.") + GoferReads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") + GoferReadWait9P = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.") + GoferReadsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.") + GoferReadWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.") +) + +// Metrics that only apply to fs/tmpfs and fsimpl/tmpfs. +var ( + TmpfsOpensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.") + TmpfsOpensW = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.") + TmpfsReads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.") + TmpfsReadWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.") +) + +// StartReadWait indicates the beginning of a file read. +func StartReadWait() time.Time { + if !RecordWaitTime { + return time.Time{} + } + return time.Now() +} + +// FinishReadWait indicates the end of a file read whose time is accounted by +// m. start must be the value returned by the corresponding call to +// StartReadWait. +// +// FinishReadWait is marked nosplit for performance since it's often called +// from defer statements, which prevents it from being inlined +// (https://github.com/golang/go/issues/38471). +//go:nosplit +func FinishReadWait(m *metric.Uint64Metric, start time.Time) { + if !RecordWaitTime { + return + } + m.IncrementBy(uint64(time.Since(start).Nanoseconds())) +} diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 15519f0df..61aeca044 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -273,7 +273,7 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { // // Callback is called when one of the files we're polling becomes ready. It // moves said file to the readyList if it's currently in the waiting list. -func (p *pollEntry) Callback(*waiter.Entry) { +func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) { e := p.epoll e.listsMu.Lock() @@ -306,9 +306,8 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) { f.EventRegister(&entry.waiter, entry.mask) // Check if the file happens to already be in a ready state. - ready := f.Readiness(entry.mask) & entry.mask - if ready != 0 { - entry.Callback(&entry.waiter) + if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 { + entry.Callback(&entry.waiter, ready) } } diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 2b3955598..f855f038b 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -8,11 +8,13 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", + "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 153d2cd9b..b66d61c6f 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -17,22 +17,45 @@ package fasync import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) -// New creates a new fs.FileAsync. -func New() fs.FileAsync { - return &FileAsync{} +// Table to convert waiter event masks into si_band siginfo codes. +// Taken from fs/fcntl.c:band_table. +var bandTable = map[waiter.EventMask]int64{ + // POLL_IN + waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM, + // POLL_OUT + waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND, + // POLL_ERR + waiter.EventErr: linux.EPOLLERR, + // POLL_PRI + waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND, + // POLL_HUP + waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR, } -// NewVFS2 creates a new vfs.FileAsync. -func NewVFS2() vfs.FileAsync { - return &FileAsync{} +// New returns a function that creates a new fs.FileAsync with the given file +// descriptor. +func New(fd int) func() fs.FileAsync { + return func() fs.FileAsync { + return &FileAsync{fd: fd} + } +} + +// NewVFS2 returns a function that creates a new vfs.FileAsync with the given +// file descriptor. +func NewVFS2(fd int) func() vfs.FileAsync { + return func() vfs.FileAsync { + return &FileAsync{fd: fd} + } } // FileAsync sends signals when the registered file is ready for IO. @@ -42,6 +65,12 @@ type FileAsync struct { // e is immutable after first use (which is protected by mu below). e waiter.Entry + // fd is the file descriptor to notify about. + // It is immutable, set at allocation time. This matches Linux semantics in + // fs/fcntl.c:fasync_helper. + // The fd value is passed to the signal recipient in siginfo.si_fd. + fd int + // regMu protects registeration and unregistration actions on e. // // regMu must be held while registration decisions are being made @@ -56,6 +85,10 @@ type FileAsync struct { mu sync.Mutex `state:"nosave"` requester *auth.Credentials registered bool + // signal is the signal to deliver upon I/O being available. + // The default value ("zero signal") means the default SIGIO signal will be + // delivered. + signal linux.Signal // Only one of the following is allowed to be non-nil. recipientPG *kernel.ProcessGroup @@ -64,10 +97,10 @@ type FileAsync struct { } // Callback sends a signal. -func (a *FileAsync) Callback(e *waiter.Entry) { +func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) { a.mu.Lock() + defer a.mu.Unlock() if !a.registered { - a.mu.Unlock() return } t := a.recipientT @@ -80,19 +113,34 @@ func (a *FileAsync) Callback(e *waiter.Entry) { } if t == nil { // No recipient has been registered. - a.mu.Unlock() return } c := t.Credentials() // Logic from sigio_perm in fs/fcntl.c. - if a.requester.EffectiveKUID == 0 || + permCheck := (a.requester.EffectiveKUID == 0 || a.requester.EffectiveKUID == c.SavedKUID || a.requester.EffectiveKUID == c.RealKUID || a.requester.RealKUID == c.SavedKUID || - a.requester.RealKUID == c.RealKUID { - t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO)) + a.requester.RealKUID == c.RealKUID) + if !permCheck { + return } - a.mu.Unlock() + signalInfo := &arch.SignalInfo{ + Signo: int32(linux.SIGIO), + Code: arch.SignalInfoKernel, + } + if a.signal != 0 { + signalInfo.Signo = int32(a.signal) + signalInfo.SetFD(uint32(a.fd)) + var band int64 + for m, bandCode := range bandTable { + if m&mask != 0 { + band |= bandCode + } + } + signalInfo.SetBand(band) + } + t.SendSignal(signalInfo) } // Register sets the file which will be monitored for IO events. @@ -186,3 +234,25 @@ func (a *FileAsync) ClearOwner() { a.recipientTG = nil a.recipientPG = nil } + +// Signal returns which signal will be sent to the signal recipient. +// A value of zero means the signal to deliver wasn't customized, which means +// the default signal (SIGIO) will be delivered. +func (a *FileAsync) Signal() linux.Signal { + a.mu.Lock() + defer a.mu.Unlock() + return a.signal +} + +// SetSignal overrides which signal to send when I/O is available. +// The default behavior can be reset by specifying signal zero, which means +// to send SIGIO. +func (a *FileAsync) SetSignal(signal linux.Signal) error { + if signal != 0 && !signal.IsValid() { + return syserror.EINVAL + } + a.mu.Lock() + defer a.mu.Unlock() + a.signal = signal + return nil +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index 470d8bf83..f17f9c59c 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -121,18 +121,21 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 panic("VFS1 and VFS2 files set") } - slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + slicePtr := (*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) // Grow the table as required. - if last := int32(len(slice)); fd >= last { + if last := int32(len(*slicePtr)); fd >= last { end := fd + 1 if end < 2*last { end = 2 * last } - slice = append(slice, make([]unsafe.Pointer, end-last)...) - atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + newSlice := append(*slicePtr, make([]unsafe.Pointer, end-last)...) + slicePtr = &newSlice + atomic.StorePointer(&f.slice, unsafe.Pointer(slicePtr)) } + slice := *slicePtr + var desc *descriptor if file != nil || fileVFS2 != nil { desc = &descriptor{ diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 2cdcdfc1f..b8627a54f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -214,9 +214,11 @@ type Kernel struct { // netlinkPorts manages allocation of netlink socket port IDs. netlinkPorts *port.Manager - // saveErr is the error causing the sandbox to exit during save, if - // any. It is protected by extMu. - saveErr error `state:"nosave"` + // saveStatus is nil if the sandbox has not been saved, errSaved or + // errAutoSaved if it has been saved successfully, or the error causing the + // sandbox to exit during save. + // It is protected by extMu. + saveStatus error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` @@ -1481,12 +1483,42 @@ func (k *Kernel) NetlinkPorts() *port.Manager { return k.netlinkPorts } -// SaveError returns the sandbox error that caused the kernel to exit during -// save. -func (k *Kernel) SaveError() error { +var ( + errSaved = errors.New("sandbox has been successfully saved") + errAutoSaved = errors.New("sandbox has been successfully auto-saved") +) + +// SaveStatus returns the sandbox save status. If it was saved successfully, +// autosaved indicates whether save was triggered by autosave. If it was not +// saved successfully, err indicates the sandbox error that caused the kernel to +// exit during save. +func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + switch k.saveStatus { + case nil: + return false, false, nil + case errSaved: + return true, false, nil + case errAutoSaved: + return true, true, nil + default: + return false, false, k.saveStatus + } +} + +// SetSaveSuccess sets the flag indicating that save completed successfully, if +// no status was already set. +func (k *Kernel) SetSaveSuccess(autosave bool) { k.extMu.Lock() defer k.extMu.Unlock() - return k.saveErr + if k.saveStatus == nil { + if autosave { + k.saveStatus = errAutoSaved + } else { + k.saveStatus = errSaved + } + } } // SetSaveError sets the sandbox error that caused the kernel to exit during @@ -1494,8 +1526,8 @@ func (k *Kernel) SaveError() error { func (k *Kernel) SetSaveError(err error) { k.extMu.Lock() defer k.extMu.Unlock() - if k.saveErr == nil { - k.saveErr = err + if k.saveStatus == nil { + k.saveStatus = err } } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 1abfe2201..cef58a590 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -259,8 +259,8 @@ func (t *Task) ptraceTrapLocked(code int32) { Signo: int32(linux.SIGTRAP), Code: code, } - t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) - t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) if t.beginPtraceStopLocked() { tracer := t.Tracer() tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index b99c0bffa..db01e4a97 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -29,17 +29,17 @@ import ( ) const ( - valueMax = 32767 // SEMVMX + // Maximum semaphore value. + valueMax = linux.SEMVMX - // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). - semaphoresMax = 32000 + // Maximum number of semaphore sets. + setsMax = linux.SEMMNI - // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). - setsMax = 32000 + // Maximum number of semaphroes in a semaphore set. + semsMax = linux.SEMMSL - // semaphoresTotalMax is "system-wide limit on the number of semaphores" - // (SEMMNS = SEMMNI*SEMMSL). - semaphoresTotalMax = 1024000000 + // Maximum number of semaphores in all semaphroe sets. + semsTotalMax = linux.SEMMNS ) // Registry maintains a set of semaphores that can be found by key or ID. @@ -52,6 +52,9 @@ type Registry struct { mu sync.Mutex `state:"nosave"` semaphores map[int32]*Set lastIDUsed int32 + // indexes maintains a mapping between a set's index in virtual array and + // its identifier. + indexes map[int32]int32 } // Set represents a set of semaphores that can be operated atomically. @@ -113,6 +116,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ userNS: userNS, semaphores: make(map[int32]*Set), + indexes: make(map[int32]int32), } } @@ -122,7 +126,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { // be found. If exclusive is true, it fails if a set with the same key already // exists. func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { - if nsems < 0 || nsems > semaphoresMax { + if nsems < 0 || nsems > semsMax { return nil, syserror.EINVAL } @@ -163,10 +167,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu } // Apply system limits. + // + // Map semaphores and map indexes in a registry are of the same size, + // check map semaphores only here for the system limit. if len(r.semaphores) >= setsMax { return nil, syserror.EINVAL } - if r.totalSems() > int(semaphoresTotalMax-nsems) { + if r.totalSems() > int(semsTotalMax-nsems) { return nil, syserror.EINVAL } @@ -176,6 +183,53 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu return r.newSet(ctx, key, owner, owner, perms, nsems) } +// IPCInfo returns information about system-wide semaphore limits and parameters. +func (r *Registry) IPCInfo() *linux.SemInfo { + return &linux.SemInfo{ + SemMap: linux.SEMMAP, + SemMni: linux.SEMMNI, + SemMns: linux.SEMMNS, + SemMnu: linux.SEMMNU, + SemMsl: linux.SEMMSL, + SemOpm: linux.SEMOPM, + SemUme: linux.SEMUME, + SemUsz: linux.SEMUSZ, + SemVmx: linux.SEMVMX, + SemAem: linux.SEMAEM, + } +} + +// SemInfo returns a seminfo structure containing the same information as +// for IPC_INFO, except that SemUsz field returns the number of existing +// semaphore sets, and SemAem field returns the number of existing semaphores. +func (r *Registry) SemInfo() *linux.SemInfo { + r.mu.Lock() + defer r.mu.Unlock() + + info := r.IPCInfo() + info.SemUsz = uint32(len(r.semaphores)) + info.SemAem = uint32(r.totalSems()) + + return info +} + +// HighestIndex returns the index of the highest used entry in +// the kernel's array. +func (r *Registry) HighestIndex() int32 { + r.mu.Lock() + defer r.mu.Unlock() + + // By default, highest used index is 0 even though + // there is no semaphroe set. + var highestIndex int32 + for index := range r.indexes { + if index > highestIndex { + highestIndex = index + } + } + return highestIndex +} + // RemoveID removes set with give 'id' from the registry and marks the set as // dead. All waiters will be awakened and fail. func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { @@ -186,6 +240,11 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { if set == nil { return syserror.EINVAL } + index, found := r.findIndexByID(id) + if !found { + // Inconsistent state. + panic(fmt.Sprintf("unable to find an index for ID: %d", id)) + } set.mu.Lock() defer set.mu.Unlock() @@ -197,6 +256,7 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { } delete(r.semaphores, set.ID) + delete(r.indexes, index) set.destroy() return nil } @@ -220,6 +280,11 @@ func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.File continue } if r.semaphores[id] == nil { + index, found := r.findFirstAvailableIndex() + if !found { + panic("unable to find an available index") + } + r.indexes[index] = id r.lastIDUsed = id r.semaphores[id] = set set.ID = id @@ -238,6 +303,18 @@ func (r *Registry) FindByID(id int32) *Set { return r.semaphores[id] } +// FindByIndex looks up a set given an index. +func (r *Registry) FindByIndex(index int32) *Set { + r.mu.Lock() + defer r.mu.Unlock() + + id, present := r.indexes[index] + if !present { + return nil + } + return r.semaphores[id] +} + func (r *Registry) findByKey(key int32) *Set { for _, v := range r.semaphores { if v.key == key { @@ -247,6 +324,24 @@ func (r *Registry) findByKey(key int32) *Set { return nil } +func (r *Registry) findIndexByID(id int32) (int32, bool) { + for k, v := range r.indexes { + if v == id { + return k, true + } + } + return 0, false +} + +func (r *Registry) findFirstAvailableIndex() (int32, bool) { + for index := int32(0); index < setsMax; index++ { + if _, present := r.indexes[index]; !present { + return index, true + } + } + return 0, false +} + func (r *Registry) totalSems() int { totalSems := 0 for _, v := range r.semaphores { diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 80a592c8f..073e14507 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -6,6 +6,9 @@ package(licenses = ["notice"]) go_template_instance( name = "shm_refs", out = "shm_refs.go", + consts = { + "enableLogging": "true", + }, package = "shm", prefix = "Shm", template = "//pkg/refsvfs2:refs_template", diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index e8cce37d0..2488ae7d5 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -73,7 +73,7 @@ func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo Signo: int32(sig), Code: arch.SignalInfoUser, } - info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) - info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) + info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) return info } diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 78f718cfe..884966120 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -106,8 +106,8 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, - PID: uint32(info.Pid()), - UID: uint32(info.Uid()), + PID: uint32(info.PID()), + UID: uint32(info.UID()), Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index a83ce219c..3fee7aa68 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -75,6 +75,12 @@ func (s *syslog) Log() []byte { "Checking naughty and nice process list...", // Check it up to twice. "Granting licence to kill(2)...", // British spelling for British movie. "Letting the watchdogs out...", + "Conjuring /dev/null black hole...", + "Adversarially training Redcode AI...", + "Singleplexing /dev/ptmx...", + "Recruiting cron-ies...", + "Verifying that no non-zero bytes made their way into /dev/zero...", + "Accelerating teletypewriter to 9600 baud...", } selectMessage := func() string { diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index c5137c282..16986244c 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -368,8 +368,8 @@ func (t *Task) exitChildren() { Signo: int32(sig), Code: arch.SignalInfoUser, } - siginfo.SetPid(int32(c.tg.pidns.tids[t])) - siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) + siginfo.SetPID(int32(c.tg.pidns.tids[t])) + siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) c.tg.signalHandlers.mu.Lock() c.sendSignalLocked(siginfo, true /* group */) c.tg.signalHandlers.mu.Unlock() @@ -698,8 +698,8 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.Si info := &arch.SignalInfo{ Signo: int32(sig), } - info.SetPid(int32(receiver.tg.pidns.tids[t])) - info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + info.SetPID(int32(receiver.tg.pidns.tids[t])) + info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) if t.exitStatus.Signaled() { info.Code = arch.CLD_KILLED info.SetStatus(int32(t.exitStatus.Signo)) diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 42dd3e278..75af3af79 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -914,8 +914,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { Signo: int32(linux.SIGCHLD), Code: code, } - sigchld.SetPid(int32(t.tg.pidns.tids[target])) - sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetPID(int32(t.tg.pidns.tids[target])) + sigchld.SetUID(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) sigchld.SetStatus(status) // TODO(b/72102453): Set utime, stime. t.sendSignalLocked(sigchld, true /* group */) @@ -1022,8 +1022,8 @@ func (*runInterrupt) execute(t *Task) taskRunState { Signo: int32(sig), Code: t.ptraceCode, } - t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) - t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } else { t.ptraceCode = int32(sig) t.ptraceSiginfo = nil @@ -1114,11 +1114,11 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { if parent == nil { // Tracer has detached and t was created by Kernel.CreateProcess(). // Pretend the parent is in an ancestor PID + user namespace. - info.SetPid(0) - info.SetUid(int32(auth.OverflowUID)) + info.SetPID(0) + info.SetUID(int32(auth.OverflowUID)) } else { - info.SetPid(int32(t.tg.pidns.tids[parent])) - info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + info.SetPID(int32(t.tg.pidns.tids[parent])) + info.SetUID(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } } t.tg.signalHandlers.mu.Lock() diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 7fd77925f..49e21026e 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -160,7 +160,7 @@ func CheckTranslateResult(required, optional MappableRange, at usermem.AccessTyp // Translations must be contiguous and in increasing order of // Translation.Source. if i > 0 && ts[i-1].Source.End != t.Source.Start { - return fmt.Errorf("Translations %+v and %+v are not contiguous", ts[i-1], t) + return fmt.Errorf("Translation %+v and Translation %+v are not contiguous", ts[i-1], t) } // At least part of each Translation must be required. if t.Source.Intersect(required).Length() == 0 { diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 4c8cd38ed..5ab2ef79f 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -36,12 +36,12 @@ type aioManager struct { contexts map[uint64]*AIOContext } -func (a *aioManager) destroy() { - a.mu.Lock() - defer a.mu.Unlock() +func (mm *MemoryManager) destroyAIOManager(ctx context.Context) { + mm.aioManager.mu.Lock() + defer mm.aioManager.mu.Unlock() - for _, ctx := range a.contexts { - ctx.destroy() + for id := range mm.aioManager.contexts { + mm.destroyAIOContextLocked(ctx, id) } } @@ -68,16 +68,26 @@ func (a *aioManager) newAIOContext(events uint32, id uint64) bool { // be drained. // // Nil is returned if the context does not exist. -func (a *aioManager) destroyAIOContext(id uint64) *AIOContext { - a.mu.Lock() - defer a.mu.Unlock() - ctx, ok := a.contexts[id] +// +// Precondition: mm.aioManager.mu is locked. +func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) *AIOContext { + aioCtx, ok := mm.aioManager.contexts[id] if !ok { return nil } - delete(a.contexts, id) - ctx.destroy() - return ctx + + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) + + delete(mm.aioManager.contexts, id) + aioCtx.destroy() + return aioCtx } // lookupAIOContext looks up the given context. @@ -140,16 +150,21 @@ func (ctx *AIOContext) checkForDone() { } } -// Prepare reserves space for a new request, returning true if available. -// Returns false if the context is busy. -func (ctx *AIOContext) Prepare() bool { +// Prepare reserves space for a new request, returning nil if available. +// Returns EAGAIN if the context is busy and EINVAL if the context is dead. +func (ctx *AIOContext) Prepare() error { ctx.mu.Lock() defer ctx.mu.Unlock() + if ctx.dead { + // Context died after the caller looked it up. + return syserror.EINVAL + } if ctx.outstanding >= ctx.maxOutstanding { - return false + // Context is busy. + return syserror.EAGAIN } ctx.outstanding++ - return true + return nil } // PopRequest pops a completed request if available, this function does not do @@ -391,20 +406,13 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint // DestroyAIOContext destroys an asynchronous I/O context. It returns the // destroyed context. nil if the context does not exist. func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext { - if _, ok := mm.LookupAIOContext(ctx, id); !ok { + if !mm.isValidAddr(ctx, id) { return nil } - // Only unmaps after it assured that the address is a valid aio context to - // prevent random memory from been unmapped. - // - // Note: It's possible to unmap this address and map something else into - // the same address. Then it would be unmapping memory that it doesn't own. - // This is, however, the way Linux implements AIO. Keeps the same [weird] - // semantics in case anyone relies on it. - mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) - - return mm.aioManager.destroyAIOContext(id) + mm.aioManager.mu.Lock() + defer mm.aioManager.mu.Unlock() + return mm.destroyAIOContextLocked(ctx, id) } // LookupAIOContext looks up the given context. It returns false if the context @@ -415,13 +423,18 @@ func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOC return nil, false } - // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes - // from id). - var buf [4]byte - _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) - if err != nil { + // Protect against 'id' that is inaccessible. + if !mm.isValidAddr(ctx, id) { return nil, false } return aioCtx, true } + +// isValidAddr determines if the address `id` is valid. (Linux also reads 4 +// bytes from id). +func (mm *MemoryManager) isValidAddr(ctx context.Context, id uint64) bool { + var buf [4]byte + _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) + return err == nil +} diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go index 3dabac1af..e8931922f 100644 --- a/pkg/sentry/mm/aio_context_state.go +++ b/pkg/sentry/mm/aio_context_state.go @@ -15,6 +15,6 @@ package mm // afterLoad is invoked by stateify. -func (a *AIOContext) afterLoad() { - a.requestReady = make(chan struct{}, 1) +func (ctx *AIOContext) afterLoad() { + ctx.requestReady = make(chan struct{}, 1) } diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 09dbc06a4..120707429 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -253,7 +253,7 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) { panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) } - mm.aioManager.destroy() + mm.destroyAIOManager(ctx) mm.metadataMu.Lock() exe := mm.executable diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index acac3d357..bc53bd41e 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -229,3 +229,46 @@ func TestIOAfterMProtect(t *testing.T) { t.Errorf("CopyOut got %d want 1", n) } } + +// TestAIOPrepareAfterDestroy tests that AIOContext should not be able to be +// prepared after destruction. +func TestAIOPrepareAfterDestroy(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + id, err := mm.NewAIOContext(ctx, 1) + if err != nil { + t.Fatalf("mm.NewAIOContext got err %v want nil", err) + } + aioCtx, ok := mm.LookupAIOContext(ctx, id) + if !ok { + t.Fatalf("AIOContext not found") + } + mm.DestroyAIOContext(ctx, id) + + // Prepare should fail because aioCtx should be destroyed. + if err := aioCtx.Prepare(); err != syserror.EINVAL { + t.Errorf("aioCtx.Prepare got err %v want nil", err) + } else if err == nil { + aioCtx.CancelPendingRequest() + } +} + +// TestAIOLookupAfterDestroy tests that AIOContext should not be able to be +// looked up after memory manager is destroyed. +func TestAIOLookupAfterDestroy(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + + id, err := mm.NewAIOContext(ctx, 1) + if err != nil { + mm.DecUsers(ctx) + t.Fatalf("mm.NewAIOContext got err %v want nil", err) + } + mm.DecUsers(ctx) // This destroys the AIOContext manager. + + if _, ok := mm.LookupAIOContext(ctx, id); ok { + t.Errorf("AIOContext found even after AIOContext manager is destroyed") + } +} diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 7c297fb9e..d99be7f46 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -423,11 +423,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.File } if f.opts.ManualZeroing { - if err := f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 - } - }); err != nil { + if err := f.manuallyZero(fr); err != nil { return memmap.FileRange{}, err } } @@ -560,19 +556,39 @@ func (f *MemoryFile) Decommit(fr memmap.FileRange) error { panic(fmt.Sprintf("invalid range: %v", fr)) } + if f.opts.ManualZeroing { + // FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in + // effect. + if err := f.manuallyZero(fr); err != nil { + return err + } + } else { + if err := f.decommitFile(fr); err != nil { + return err + } + } + + f.markDecommitted(fr) + return nil +} + +func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error { + return f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) +} + +func (f *MemoryFile) decommitFile(fr memmap.FileRange) error { // "After a successful call, subsequent reads from this range will // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) - err := syscall.Fallocate( + return syscall.Fallocate( int(f.file.Fd()), _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, int64(fr.Start), int64(fr.Length())) - if err != nil { - return err - } - f.markDecommitted(fr) - return nil } func (f *MemoryFile) markDecommitted(fr memmap.FileRange) { @@ -1044,20 +1060,20 @@ func (f *MemoryFile) runReclaim() { break } - if err := f.Decommit(fr); err != nil { - log.Warningf("Reclaim failed to decommit %v: %v", fr, err) - // Zero the pages manually. This won't reduce memory usage, but at - // least ensures that the pages will be zero when reallocated. - f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 + // If ManualZeroing is in effect, pages will be zeroed on allocation + // and may not be freed by decommitFile, so calling decommitFile is + // unnecessary. + if !f.opts.ManualZeroing { + if err := f.decommitFile(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. + if err := f.manuallyZero(fr); err != nil { + panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err)) } - }) - // Pretend the pages were decommitted even though they weren't, - // since the memory accounting implementation has no idea how to - // deal with this. - f.markDecommitted(fr) + } } + f.markDecommitted(fr) f.markReclaimed(fr) } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index acad4c793..f8ccb7430 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -91,6 +91,13 @@ func bluepillSigBus(c *vCPU) { } } +// bluepillHandleEnosys is reponsible for handling enosys error. +// +//go:nosplit +func bluepillHandleEnosys(c *vCPU) { + throw("run failed: ENOSYS") +} + // bluepillReadyStopGuest checks whether the current vCPU is ready for interrupt injection. // //go:nosplit @@ -126,3 +133,10 @@ func bluepillReadyStopGuest(c *vCPU) bool { } return true } + +// bluepillArchHandleExit checks architecture specific exitcode. +// +//go:nosplit +func bluepillArchHandleExit(c *vCPU, context unsafe.Pointer) { + c.die(bluepillArchContext(context), "unknown") +} diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index 965ad66b5..1f09813ba 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -42,6 +42,13 @@ var ( sErrEsr: _ESR_ELx_SERR_NMI, }, } + + // vcpuExtDabt is the event of ext_dabt. + vcpuExtDabt = kvmVcpuEvents{ + exception: exception{ + extDabtPending: 1, + }, + } ) // getTLS returns the value of TPIDR_EL0 register. diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index 9433d4da5..4d912769a 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -85,7 +85,7 @@ func bluepillStopGuest(c *vCPU) { uintptr(c.fd), _KVM_SET_VCPU_EVENTS, uintptr(unsafe.Pointer(&vcpuSErrBounce))); errno != 0 { - throw("sErr injection failed") + throw("bounce sErr injection failed") } } @@ -93,18 +93,54 @@ func bluepillStopGuest(c *vCPU) { // //go:nosplit func bluepillSigBus(c *vCPU) { + // Host must support ARM64_HAS_RAS_EXTN. if _, _, errno := syscall.RawSyscall( // escapes: no. syscall.SYS_IOCTL, uintptr(c.fd), _KVM_SET_VCPU_EVENTS, uintptr(unsafe.Pointer(&vcpuSErrNMI))); errno != 0 { - throw("sErr injection failed") + if errno == syscall.EINVAL { + throw("No ARM64_HAS_RAS_EXTN feature in host.") + } + throw("nmi sErr injection failed") } } +// bluepillExtDabt is reponsible for injecting external data abort. +// +//go:nosplit +func bluepillExtDabt(c *vCPU) { + if _, _, errno := syscall.RawSyscall( // escapes: no. + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_VCPU_EVENTS, + uintptr(unsafe.Pointer(&vcpuExtDabt))); errno != 0 { + throw("ext_dabt injection failed") + } +} + +// bluepillHandleEnosys is reponsible for handling enosys error. +// +//go:nosplit +func bluepillHandleEnosys(c *vCPU) { + bluepillExtDabt(c) +} + // bluepillReadyStopGuest checks whether the current vCPU is ready for sError injection. // //go:nosplit func bluepillReadyStopGuest(c *vCPU) bool { return true } + +// bluepillArchHandleExit checks architecture specific exitcode. +// +//go:nosplit +func bluepillArchHandleExit(c *vCPU, context unsafe.Pointer) { + switch c.runData.exitReason { + case _KVM_EXIT_ARM_NISV: + bluepillExtDabt(c) + default: + c.die(bluepillArchContext(context), "unknown") + } +} diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 75085ac6a..8c5369377 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -148,6 +148,9 @@ func bluepillHandler(context unsafe.Pointer) { // mode and have interrupts disabled. bluepillSigBus(c) continue // Rerun vCPU. + case syscall.ENOSYS: + bluepillHandleEnosys(c) + continue default: throw("run failed") } @@ -220,7 +223,7 @@ func bluepillHandler(context unsafe.Pointer) { c.die(bluepillArchContext(context), "entry failed") return default: - c.die(bluepillArchContext(context), "unknown") + bluepillArchHandleExit(c, context) return } } diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index 0b06a923a..9db1db4e9 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -47,10 +47,11 @@ type userRegs struct { } type exception struct { - sErrPending uint8 - sErrHasEsr uint8 - pad [6]uint8 - sErrEsr uint64 + sErrPending uint8 + sErrHasEsr uint8 + extDabtPending uint8 + pad [5]uint8 + sErrEsr uint64 } type kvmVcpuEvents struct { diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 6abaa21c4..2492d57be 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -56,6 +56,7 @@ const ( _KVM_EXIT_FAIL_ENTRY = 0x9 _KVM_EXIT_INTERNAL_ERROR = 0x11 _KVM_EXIT_SYSTEM_EVENT = 0x18 + _KVM_EXIT_ARM_NISV = 0x1c ) // KVM capability options. diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index fd92c3873..3f5be276b 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -263,13 +263,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return usermem.NoAccess, platform.ErrContextInterrupt case ring0.El0SyncUndef: return c.fault(int32(syscall.SIGILL), info) - case ring0.El1SyncUndef: - *info = arch.SignalInfo{ - Signo: int32(syscall.SIGILL), - Code: 1, // ILL_ILLOPC (illegal opcode). - } - info.SetAddr(switchOpts.Registers.Pc) // Include address. - return usermem.AccessType{}, platform.ErrContextSignal default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index f56aa3b79..571bfcc2e 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -18,8 +18,8 @@ // // In a nutshell, it works as follows: // -// The creation of a new address space creates a new child processes with a -// single thread which is traced by a single goroutine. +// The creation of a new address space creates a new child process with a single +// thread which is traced by a single goroutine. // // A context is just a collection of temporary variables. Calling Switch on a // context does the following: diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 812ab80ef..aacd7ce70 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -590,7 +590,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { // facilitate vsyscall emulation. See patchSignalInfo. patchSignalInfo(regs, &c.signalInfo) return false - } else if c.signalInfo.Code <= 0 && c.signalInfo.Pid() == int32(os.Getpid()) { + } else if c.signalInfo.Code <= 0 && c.signalInfo.PID() == int32(os.Getpid()) { // The signal was generated by this process. That means // that it was an interrupt or something else that we // should bail for. Note that we ignore signals diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 679b287c3..2852b7387 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "arch_genrule", "go_library") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) @@ -39,19 +39,19 @@ go_template_instance( template = ":defs_arm64", ) -genrule( +arch_genrule( name = "entry_impl_amd64", srcs = ["entry_amd64.s"], outs = ["entry_impl_amd64.s"], - cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + cmd = "(echo -e '// build +amd64\\n' && QEMU $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(location entry_amd64.s)) > $@", tools = ["//pkg/sentry/platform/ring0/gen_offsets"], ) -genrule( +arch_genrule( name = "entry_impl_arm64", srcs = ["entry_arm64.s"], outs = ["entry_impl_arm64.s"], - cmd = "(echo -e '// build +arm64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + cmd = "(echo -e '// build +arm64\\n' && QEMU $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(location entry_arm64.s)) > $@", tools = ["//pkg/sentry/platform/ring0/gen_offsets"], ) @@ -72,7 +72,6 @@ go_library( "lib_amd64.s", "lib_arm64.go", "lib_arm64.s", - "lib_arm64_unsafe.go", "ring0.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go index 327d48465..c51df2811 100644 --- a/pkg/sentry/platform/ring0/aarch64.go +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -90,6 +90,7 @@ const ( El0SyncIa El0SyncFpsimdAcc El0SyncSveAcc + El0SyncFpsimdExc El0SyncSys El0SyncSpPc El0SyncUndef diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index f489ad352..cf0bf3528 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -132,40 +132,6 @@ MOVD offset+PTRACE_R29(reg), R29; \ MOVD offset+PTRACE_R30(reg), R30; -// NOP-s -#define nop31Instructions() \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; \ - WORD $0xd503201f; - #define ESR_ELx_EC_UNKNOWN (0x00) #define ESR_ELx_EC_WFx (0x01) /* Unallocated EC: 0x02 */ @@ -324,6 +290,18 @@ MOVD CPU_TTBR0_KVM(from), RSV_REG; \ MSR RSV_REG, TTBR0_EL1; +TEXT ·EnableVFP(SB),NOSPLIT,$0 + MOVD $FPEN_ENABLE, R0 + WORD $0xd5181040 //MSR R0, CPACR_EL1 + ISB $15 + RET + +TEXT ·DisableVFP(SB),NOSPLIT,$0 + MOVD $0, R0 + WORD $0xd5181040 //MSR R0, CPACR_EL1 + ISB $15 + RET + #define VFP_ENABLE \ MOVD $FPEN_ENABLE, R0; \ WORD $0xd5181040; \ //MSR R0, CPACR_EL1 @@ -370,12 +348,12 @@ MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack. -// EXCEPTION_WITH_ERROR is a common exception handler function. -#define EXCEPTION_WITH_ERROR(user, vector) \ +// EXCEPTION_EL0 is a common el0 exception handler function. +#define EXCEPTION_EL0(vector) \ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 WORD $0xd538601a; \ //MRS FAR_EL1, R26 MOVD R26, CPU_FAULT_ADDR(RSV_REG); \ - MOVD $user, R3; \ + MOVD $1, R3; \ MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user. MOVD $vector, R3; \ MOVD R3, CPU_VECTOR_CODE(RSV_REG); \ @@ -383,6 +361,12 @@ MOVD R3, CPU_ERROR_CODE(RSV_REG); \ B ·kernelExitToEl1(SB); +// EXCEPTION_EL1 is a common el1 exception handler function. +#define EXCEPTION_EL1(vector) \ + MOVD $vector, R3; \ + MOVD R3, 8(RSP); \ + B ·HaltEl1ExceptionAndResume(SB); + // storeAppASID writes the application's asid value. TEXT ·storeAppASID(SB),NOSPLIT,$0-8 MOVD asid+0(FP), R1 @@ -430,6 +414,16 @@ TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0 CALL ·kernelSyscall(SB) // Call the trampoline. B ·kernelExitToEl1(SB) // Resume. +// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. +TEXT ·HaltEl1ExceptionAndResume(SB),NOSPLIT,$0-8 + WORD $0xd538d092 // MRS TPIDR_EL1, R18 + MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. + MOVD R3, 8(RSP) // First argument (vCPU). + MOVD vector+0(FP), R3 + MOVD R3, 16(RSP) // Second argument (vector). + CALL ·kernelException(SB) // Call the trampoline. + B ·kernelExitToEl1(SB) // Resume. + // Shutdown stops the guest. TEXT ·Shutdown(SB),NOSPLIT,$0 // PSCI EVENT. @@ -592,39 +586,22 @@ TEXT ·El1_sync(SB),NOSPLIT,$0 B el1_invalid el1_da: + EXCEPTION_EL1(El1SyncDa) el1_ia: - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - WORD $0xd538601a //MRS FAR_EL1, R26 - - MOVD R26, CPU_FAULT_ADDR(RSV_REG) - - MOVD $0, CPU_ERROR_TYPE(RSV_REG) - - MOVD $PageFault, R3 - MOVD R3, CPU_VECTOR_CODE(RSV_REG) - - B ·HaltAndResume(SB) - + EXCEPTION_EL1(El1SyncIa) el1_sp_pc: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncSpPc) el1_undef: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncUndef) el1_svc: - MOVD $0, CPU_ERROR_CODE(RSV_REG) - MOVD $0, CPU_ERROR_TYPE(RSV_REG) B ·HaltEl1SvcAndResume(SB) - el1_dbg: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncDbg) el1_fpsimd_acc: VFP_ENABLE B ·kernelExitToEl1(SB) // Resume. - el1_invalid: - B ·Shutdown(SB) + EXCEPTION_EL1(El1SyncInv) // El1_irq is the handler for El1_irq. TEXT ·El1_irq(SB),NOSPLIT,$0 @@ -680,28 +657,21 @@ el0_svc: el0_da: el0_ia: - EXCEPTION_WITH_ERROR(1, PageFault) - + EXCEPTION_EL0(PageFault) el0_fpsimd_acc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncFpsimdAcc) el0_sve_acc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncSveAcc) el0_fpsimd_exc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncFpsimdExc) el0_sp_pc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncSpPc) el0_undef: - EXCEPTION_WITH_ERROR(1, El0SyncUndef) - + EXCEPTION_EL0(El0SyncUndef) el0_dbg: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncDbg) el0_invalid: - B ·Shutdown(SB) + EXCEPTION_EL0(El0SyncInv) TEXT ·El0_irq(SB),NOSPLIT,$0 B ·Shutdown(SB) @@ -760,79 +730,43 @@ TEXT ·El0_error_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) // Vectors implements exception vector table. +// The start address of exception vector table should be 11-bits aligned. +// For detail, please refer to arm developer document: +// https://developer.arm.com/documentation/100933/0100/AArch64-exception-vector-table +// Also can refer to the code in linux kernel: arch/arm64/kernel/entry.S TEXT ·Vectors(SB),NOSPLIT,$0 + PCALIGN $2048 B ·El1_sync_invalid(SB) - nop31Instructions() + PCALIGN $128 B ·El1_irq_invalid(SB) - nop31Instructions() + PCALIGN $128 B ·El1_fiq_invalid(SB) - nop31Instructions() + PCALIGN $128 B ·El1_error_invalid(SB) - nop31Instructions() + PCALIGN $128 B ·El1_sync(SB) - nop31Instructions() + PCALIGN $128 B ·El1_irq(SB) - nop31Instructions() + PCALIGN $128 B ·El1_fiq(SB) - nop31Instructions() + PCALIGN $128 B ·El1_error(SB) - nop31Instructions() + PCALIGN $128 B ·El0_sync(SB) - nop31Instructions() + PCALIGN $128 B ·El0_irq(SB) - nop31Instructions() + PCALIGN $128 B ·El0_fiq(SB) - nop31Instructions() + PCALIGN $128 B ·El0_error(SB) - nop31Instructions() + PCALIGN $128 B ·El0_sync_invalid(SB) - nop31Instructions() + PCALIGN $128 B ·El0_irq_invalid(SB) - nop31Instructions() + PCALIGN $128 B ·El0_fiq_invalid(SB) - nop31Instructions() + PCALIGN $128 B ·El0_error_invalid(SB) - nop31Instructions() - - // The exception-vector-table is required to be 11-bits aligned. - // Please see Linux source code as reference: arch/arm64/kernel/entry.s. - // For gvisor, I defined it as 4K in length, filled the 2nd 2K part with NOPs. - // So that, I can safely move the 1st 2K part into the address with 11-bits alignment. - WORD $0xd503201f //nop - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() - WORD $0xd503201f - nop31Instructions() diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 9742308d8..a9703baf6 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -24,6 +24,9 @@ go_binary( "defs_impl_arm64.go", "main.go", ], + # Use the libc malloc to avoid any extra dependencies. This is required to + # pass the sentry deps test. + system_malloc = True, visibility = [ "//pkg/sentry/platform/kvm:__pkg__", "//pkg/sentry/platform/ring0:__pkg__", diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index c1c808b96..c05284641 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -24,6 +24,10 @@ func HaltAndResume() //go:nosplit func HaltEl1SvcAndResume() +// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. +//go:nosplit +func HaltEl1ExceptionAndResume() + // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { } @@ -49,6 +53,12 @@ func IsCanonical(addr uint64) bool { return addr <= 0x0000ffffffffffff || addr > 0xffff000000000000 } +// SwitchToUser performs an eret. +// +// The return value is the exception vector. +// +// +checkescape:all +// //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { storeAppASID(uintptr(switchOpts.UserASID)) @@ -61,11 +71,13 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { regs.Pstate &= ^uint64(PsrFlagsClear) regs.Pstate |= UserFlagsSet + EnableVFP() LoadFloatingPoint(switchOpts.FloatingPointState) kernelExitToEl0() SaveFloatingPoint(switchOpts.FloatingPointState) + DisableVFP() vector = c.vecCode diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index bf1c655f4..a490bf3af 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -34,13 +34,13 @@ func FlushTlbAll() // CPACREL1 returns the value of the CPACR_EL1 register. func CPACREL1() (value uintptr) -// FPCR returns the value of FPCR register. +// GetFPCR returns the value of FPCR register. func GetFPCR() (value uintptr) // SetFPCR writes the FPCR value. func SetFPCR(value uintptr) -// FPSR returns the value of FPSR register. +// GetFPSR returns the value of FPSR register. func GetFPSR() (value uintptr) // SetFPSR writes the FPSR value. @@ -59,9 +59,13 @@ func LoadFloatingPoint(*byte) // SaveFloatingPoint saves floating point state. func SaveFloatingPoint(*byte) +// EnableVFP enables fpsimd. +func EnableVFP() + +// DisableVFP disables fpsimd. +func DisableVFP() + // Init sets function pointers based on architectural features. // // This must be called prior to using ring0. -func Init() { - rewriteVectors() -} +func Init() {} diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s index 675a8bdb7..e39b32841 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -52,62 +52,47 @@ TEXT ·CPACREL1(SB),NOSPLIT,$0-8 RET TEXT ·GetFPCR(SB),NOSPLIT,$0-8 - WORD $0xd53b4201 // MRS NZCV, R1 + MOVD FPCR, R1 MOVD R1, ret+0(FP) RET TEXT ·GetFPSR(SB),NOSPLIT,$0-8 - WORD $0xd53b4421 // MRS FPSR, R1 + MOVD FPSR, R1 MOVD R1, ret+0(FP) RET TEXT ·SetFPCR(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R1 - WORD $0xd51b4201 // MSR R1, NZCV + MOVD R1, FPCR RET TEXT ·SetFPSR(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R1 - WORD $0xd51b4421 // MSR R1, FPSR + MOVD R1, FPSR RET TEXT ·SaveVRegs(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R0 // Skip aarch64_ctx, fpsr, fpcr. - FMOVD F0, 16*1(R0) - FMOVD F1, 16*2(R0) - FMOVD F2, 16*3(R0) - FMOVD F3, 16*4(R0) - FMOVD F4, 16*5(R0) - FMOVD F5, 16*6(R0) - FMOVD F6, 16*7(R0) - FMOVD F7, 16*8(R0) - FMOVD F8, 16*9(R0) - FMOVD F9, 16*10(R0) - FMOVD F10, 16*11(R0) - FMOVD F11, 16*12(R0) - FMOVD F12, 16*13(R0) - FMOVD F13, 16*14(R0) - FMOVD F14, 16*15(R0) - FMOVD F15, 16*16(R0) - FMOVD F16, 16*17(R0) - FMOVD F17, 16*18(R0) - FMOVD F18, 16*19(R0) - FMOVD F19, 16*20(R0) - FMOVD F20, 16*21(R0) - FMOVD F21, 16*22(R0) - FMOVD F22, 16*23(R0) - FMOVD F23, 16*24(R0) - FMOVD F24, 16*25(R0) - FMOVD F25, 16*26(R0) - FMOVD F26, 16*27(R0) - FMOVD F27, 16*28(R0) - FMOVD F28, 16*29(R0) - FMOVD F29, 16*30(R0) - FMOVD F30, 16*31(R0) - FMOVD F31, 16*32(R0) - ISB $15 + ADD $16, R0, R0 + + WORD $0xad000400 // stp q0, q1, [x0] + WORD $0xad010c02 // stp q2, q3, [x0, #32] + WORD $0xad021404 // stp q4, q5, [x0, #64] + WORD $0xad031c06 // stp q6, q7, [x0, #96] + WORD $0xad042408 // stp q8, q9, [x0, #128] + WORD $0xad052c0a // stp q10, q11, [x0, #160] + WORD $0xad06340c // stp q12, q13, [x0, #192] + WORD $0xad073c0e // stp q14, q15, [x0, #224] + WORD $0xad084410 // stp q16, q17, [x0, #256] + WORD $0xad094c12 // stp q18, q19, [x0, #288] + WORD $0xad0a5414 // stp q20, q21, [x0, #320] + WORD $0xad0b5c16 // stp q22, q23, [x0, #352] + WORD $0xad0c6418 // stp q24, q25, [x0, #384] + WORD $0xad0d6c1a // stp q26, q27, [x0, #416] + WORD $0xad0e741c // stp q28, q29, [x0, #448] + WORD $0xad0f7c1e // stp q30, q31, [x0, #480] RET @@ -115,39 +100,24 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R0 // Skip aarch64_ctx, fpsr, fpcr. - FMOVD 16*1(R0), F0 - FMOVD 16*2(R0), F1 - FMOVD 16*3(R0), F2 - FMOVD 16*4(R0), F3 - FMOVD 16*5(R0), F4 - FMOVD 16*6(R0), F5 - FMOVD 16*7(R0), F6 - FMOVD 16*8(R0), F7 - FMOVD 16*9(R0), F8 - FMOVD 16*10(R0), F9 - FMOVD 16*11(R0), F10 - FMOVD 16*12(R0), F11 - FMOVD 16*13(R0), F12 - FMOVD 16*14(R0), F13 - FMOVD 16*15(R0), F14 - FMOVD 16*16(R0), F15 - FMOVD 16*17(R0), F16 - FMOVD 16*18(R0), F17 - FMOVD 16*19(R0), F18 - FMOVD 16*20(R0), F19 - FMOVD 16*21(R0), F20 - FMOVD 16*22(R0), F21 - FMOVD 16*23(R0), F22 - FMOVD 16*24(R0), F23 - FMOVD 16*25(R0), F24 - FMOVD 16*26(R0), F25 - FMOVD 16*27(R0), F26 - FMOVD 16*28(R0), F27 - FMOVD 16*29(R0), F28 - FMOVD 16*30(R0), F29 - FMOVD 16*31(R0), F30 - FMOVD 16*32(R0), F31 - ISB $15 + ADD $16, R0, R0 + + WORD $0xad400400 // ldp q0, q1, [x0] + WORD $0xad410c02 // ldp q2, q3, [x0, #32] + WORD $0xad421404 // ldp q4, q5, [x0, #64] + WORD $0xad431c06 // ldp q6, q7, [x0, #96] + WORD $0xad442408 // ldp q8, q9, [x0, #128] + WORD $0xad452c0a // ldp q10, q11, [x0, #160] + WORD $0xad46340c // ldp q12, q13, [x0, #192] + WORD $0xad473c0e // ldp q14, q15, [x0, #224] + WORD $0xad484410 // ldp q16, q17, [x0, #256] + WORD $0xad494c12 // ldp q18, q19, [x0, #288] + WORD $0xad4a5414 // ldp q20, q21, [x0, #320] + WORD $0xad4b5c16 // ldp q22, q23, [x0, #352] + WORD $0xad4c6418 // ldp q24, q25, [x0, #384] + WORD $0xad4d6c1a // ldp q26, q27, [x0, #416] + WORD $0xad4e741c // ldp q28, q29, [x0, #448] + WORD $0xad4f7c1e // ldp q30, q31, [x0, #480] RET diff --git a/pkg/sentry/platform/ring0/lib_arm64_unsafe.go b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go deleted file mode 100644 index c05166fea..000000000 --- a/pkg/sentry/platform/ring0/lib_arm64_unsafe.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -import ( - "reflect" - "syscall" - "unsafe" - - "gvisor.dev/gvisor/pkg/safecopy" - "gvisor.dev/gvisor/pkg/usermem" -) - -const ( - nopInstruction = 0xd503201f - instSize = unsafe.Sizeof(uint32(0)) - vectorsRawLen = 0x800 -) - -func unsafeSlice(addr uintptr, length int) (slice []uint32) { - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - hdr.Data = addr - hdr.Len = length / int(instSize) - hdr.Cap = length / int(instSize) - return slice -} - -// Work around: move ring0.Vectors() into a specific address with 11-bits alignment. -// -// According to the design documentation of Arm64, -// the start address of exception vector table should be 11-bits aligned. -// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S -// But, we can't align a function's start address to a specific address by using golang. -// We have raised this question in golang community: -// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I -// This function will be removed when golang supports this feature. -// -// There are 2 jobs were implemented in this function: -// 1, move the start address of exception vector table into the specific address. -// 2, modify the offset of each instruction. -func rewriteVectors() { - vectorsBegin := reflect.ValueOf(Vectors).Pointer() - - // The exception-vector-table is required to be 11-bits aligned. - // And the size is 0x800. - // Please see the documentation as reference: - // https://developer.arm.com/docs/100933/0100/aarch64-exception-vector-table - // - // But, golang does not allow to set a function's address to a specific value. - // So, for gvisor, I defined the size of exception-vector-table as 4K, - // filled the 2nd 2K part with NOP-s. - // So that, I can safely move the 1st 2K part into the address with 11-bits alignment. - // - // So, the prerequisite for this function to work correctly is: - // vectorsSafeLen >= 0x1000 - // vectorsRawLen = 0x800 - vectorsSafeLen := int(safecopy.FindEndAddress(vectorsBegin) - vectorsBegin) - if vectorsSafeLen < 2*vectorsRawLen { - panic("Can't update vectors") - } - - vectorsSafeTable := unsafeSlice(vectorsBegin, vectorsSafeLen) // Now a []uint32 - vectorsRawLen32 := vectorsRawLen / int(instSize) - - offset := vectorsBegin & (1<<11 - 1) - if offset != 0 { - offset = 1<<11 - offset - } - - pageBegin := (vectorsBegin + offset) & ^uintptr(usermem.PageSize-1) - - _, _, errno := syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC)) - if errno != 0 { - panic(errno.Error()) - } - - offset = offset / instSize // By index, not bytes. - // Move exception-vector-table into the specific address, should uses memmove here. - for i := 1; i <= vectorsRawLen32; i++ { - vectorsSafeTable[int(offset)+vectorsRawLen32-i] = vectorsSafeTable[vectorsRawLen32-i] - } - - // Adjust branch since instruction was moved forward. - for i := 0; i < vectorsRawLen32; i++ { - if vectorsSafeTable[int(offset)+i] != nopInstruction { - vectorsSafeTable[int(offset)+i] -= uint32(offset) - } - } - - _, _, errno = syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_EXEC)) - if errno != 0 { - panic(errno.Error()) - } -} diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index 53bc3353c..b5652deb9 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -70,6 +70,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define El0SyncIa 0x%02x\n", El0SyncIa) fmt.Fprintf(w, "#define El0SyncFpsimdAcc 0x%02x\n", El0SyncFpsimdAcc) fmt.Fprintf(w, "#define El0SyncSveAcc 0x%02x\n", El0SyncSveAcc) + fmt.Fprintf(w, "#define El0SyncFpsimdExc 0x%02x\n", El0SyncFpsimdExc) fmt.Fprintf(w, "#define El0SyncSys 0x%02x\n", El0SyncSys) fmt.Fprintf(w, "#define El0SyncSpPc 0x%02x\n", El0SyncSpPc) fmt.Fprintf(w, "#define El0SyncUndef 0x%02x\n", El0SyncUndef) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index bc16a1622..7605d0cb2 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -58,6 +58,15 @@ type PageTables struct { readOnlyShared bool } +// Init initializes a set of PageTables. +// +//go:nosplit +func (p *PageTables) Init(allocator Allocator) { + p.Allocator = allocator + p.root = p.Allocator.NewPTEs() + p.rootPhysical = p.Allocator.PhysicalFor(p.root) +} + // NewWithUpper returns new PageTables. // // upperSharedPageTables are used for mapping the upper of addresses, @@ -73,14 +82,17 @@ type PageTables struct { func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables { p := new(PageTables) p.Init(a) + if upperSharedPageTables != nil { if !upperSharedPageTables.readOnlyShared { panic("Only read-only shared pagetables can be used as upper") } p.upperSharedPageTables = upperSharedPageTables p.upperStart = upperStart - p.cloneUpperShared() } + + p.InitArch(a) + return p } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go index a4e416af7..520161755 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go @@ -24,6 +24,14 @@ import ( // archPageTables is architecture-specific data. type archPageTables struct { + // root is the pagetable root for kernel space. + root *PTEs + + // rootPhysical is the cached physical address of the root. + // + // This is saved only to prevent constant translation. + rootPhysical uintptr + asid uint16 } @@ -38,7 +46,7 @@ func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 { // //go:nosplit func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 { - return uint64(p.upperSharedPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset + return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset } // Bits in page table entries. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index e7ab887e5..4bdde8448 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -41,13 +41,13 @@ const ( entriesPerPage = 512 ) -// Init initializes a set of PageTables. +// InitArch does some additional initialization related to the architecture. // //go:nosplit -func (p *PageTables) Init(allocator Allocator) { - p.Allocator = allocator - p.root = p.Allocator.NewPTEs() - p.rootPhysical = p.Allocator.PhysicalFor(p.root) +func (p *PageTables) InitArch(allocator Allocator) { + if p.upperSharedPageTables != nil { + p.cloneUpperShared() + } } func pgdIndex(upperStart uintptr) uintptr { diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go index 5392bf27a..ad0e30c88 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go @@ -42,13 +42,16 @@ const ( entriesPerPage = 512 ) -// Init initializes a set of PageTables. +// InitArch does some additional initialization related to the architecture. // //go:nosplit -func (p *PageTables) Init(allocator Allocator) { - p.Allocator = allocator - p.root = p.Allocator.NewPTEs() - p.rootPhysical = p.Allocator.PhysicalFor(p.root) +func (p *PageTables) InitArch(allocator Allocator) { + if p.upperSharedPageTables != nil { + p.cloneUpperShared() + } else { + p.archPageTables.root = p.Allocator.NewPTEs() + p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root) + } } // cloneUpperShared clone the upper from the upper shared page tables. @@ -59,7 +62,8 @@ func (p *PageTables) cloneUpperShared() { panic("upperStart should be the same as upperBottom") } - // nothing to do for arm. + p.archPageTables.root = p.upperSharedPageTables.archPageTables.root + p.archPageTables.rootPhysical = p.upperSharedPageTables.archPageTables.rootPhysical } // PTEs is a collection of entries. diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go index 157c9a7cc..c261d393a 100644 --- a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go +++ b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go @@ -116,7 +116,7 @@ func next(start uintptr, size uintptr) uintptr { func (w *Walker) iterateRangeCanonical(start, end uintptr) { pgdEntryIndex := w.pageTables.root if start >= upperBottom { - pgdEntryIndex = w.pageTables.upperSharedPageTables.root + pgdEntryIndex = w.pageTables.archPageTables.root } for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index a3f775d15..cc1f6bfcc 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -20,6 +20,7 @@ go_library( "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/tcpip", + "//pkg/tcpip/header", "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index ca16d0381..fb7c5dc61 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/syserror", - "//pkg/tcpip", "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 70ccf77a7..b88cdca48 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" ) @@ -344,21 +343,34 @@ func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte { } // PackIPPacketInfo packs an IP_PKTINFO socket control message. -func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte { - var p linux.ControlMessageIPPacketInfo - p.NIC = int32(packetInfo.NIC) - copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr)) - copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr)) - +func PackIPPacketInfo(t *kernel.Task, packetInfo *linux.ControlMessageIPPacketInfo, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IP, linux.IP_PKTINFO, t.Arch().Width(), - p, + packetInfo, ) } +// PackOriginalDstAddress packs an IP_RECVORIGINALDSTADDR socket control message. +func PackOriginalDstAddress(t *kernel.Task, originalDstAddress linux.SockAddr, buf []byte) []byte { + var level uint32 + var optType uint32 + switch originalDstAddress.(type) { + case *linux.SockAddrInet: + level = linux.SOL_IP + optType = linux.IP_RECVORIGDSTADDR + case *linux.SockAddrInet6: + level = linux.SOL_IPV6 + optType = linux.IPV6_RECVORIGDSTADDR + default: + panic("invalid address type, must be an IP address for IP_RECVORIGINALDSTADDR cmsg") + } + return putCmsgStruct( + buf, level, optType, t.Arch().Width(), originalDstAddress) +} + // PackControlMessages packs control messages into the given buffer. // // We skip control messages specific to Unix domain sockets. @@ -384,7 +396,11 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt } if cmsgs.IP.HasIPPacketInfo { - buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf) + buf = PackIPPacketInfo(t, &cmsgs.IP.PacketInfo, buf) + } + + if cmsgs.IP.OriginalDstAddress != nil { + buf = PackOriginalDstAddress(t, cmsgs.IP.OriginalDstAddress, buf) } return buf @@ -416,17 +432,15 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { space += cmsgSpace(t, linux.SizeOfControlMessageTClass) } - return space -} + if cmsgs.IP.HasIPPacketInfo { + space += cmsgSpace(t, linux.SizeOfControlMessageIPPacketInfo) + } -// NewIPPacketInfo returns the IPPacketInfo struct. -func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo { - var p tcpip.IPPacketInfo - p.NIC = tcpip.NICID(packetInfo.NIC) - copy([]byte(p.LocalAddr), packetInfo.LocalAddr[:]) - copy([]byte(p.DestinationAddr), packetInfo.DestinationAddr[:]) + if cmsgs.IP.OriginalDstAddress != nil { + space += cmsgSpace(t, cmsgs.IP.OriginalDstAddress.SizeBytes()) + } - return p + return space } // Parse parses a raw socket control message into portable objects. @@ -489,6 +503,14 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con cmsgs.Unix.Credentials = scmCreds i += binary.AlignUp(length, width) + case linux.SO_TIMESTAMP: + if length < linux.SizeOfTimeval { + return socket.ControlMessages{}, syserror.EINVAL + } + cmsgs.IP.HasTimestamp = true + binary.Unmarshal(buf[i:i+linux.SizeOfTimeval], usermem.ByteOrder, &cmsgs.IP.Timestamp) + i += binary.AlignUp(length, width) + default: // Unknown message type. return socket.ControlMessages{}, syserror.EINVAL @@ -512,7 +534,16 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con var packetInfo linux.ControlMessageIPPacketInfo binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo) - cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo) + cmsgs.IP.PacketInfo = packetInfo + i += binary.AlignUp(length, width) + + case linux.IP_RECVORIGDSTADDR: + var addr linux.SockAddrInet + if length < addr.SizeBytes() { + return socket.ControlMessages{}, syserror.EINVAL + } + binary.Unmarshal(buf[i:i+addr.SizeBytes()], usermem.ByteOrder, &addr) + cmsgs.IP.OriginalDstAddress = &addr i += binary.AlignUp(length, width) default: @@ -528,6 +559,15 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass) i += binary.AlignUp(length, width) + case linux.IPV6_RECVORIGDSTADDR: + var addr linux.SockAddrInet6 + if length < addr.SizeBytes() { + return socket.ControlMessages{}, syserror.EINVAL + } + binary.Unmarshal(buf[i:i+addr.SizeBytes()], usermem.ByteOrder, &addr) + cmsgs.IP.OriginalDstAddress = &addr + i += binary.AlignUp(length, width) + default: return socket.ControlMessages{}, syserror.EINVAL } diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 7d3c4a01c..1f220c343 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -331,17 +331,17 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr switch level { case linux.SOL_IP: switch name { - case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO: + case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO, linux.IP_RECVORIGDSTADDR: optlen = sizeofInt32 } case linux.SOL_IPV6: switch name { - case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY, linux.IPV6_RECVORIGDSTADDR: optlen = sizeofInt32 } case linux.SOL_SOCKET: switch name { - case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: + case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR, linux.SO_TIMESTAMP: optlen = sizeofInt32 case linux.SO_LINGER: optlen = syscall.SizeofLinger @@ -377,24 +377,24 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] switch level { case linux.SOL_IP: switch name { - case linux.IP_TOS, linux.IP_RECVTOS: + case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_RECVORIGDSTADDR: optlen = sizeofInt32 case linux.IP_PKTINFO: optlen = linux.SizeOfControlMessageIPPacketInfo } case linux.SOL_IPV6: switch name { - case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY: + case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY, linux.IPV6_RECVORIGDSTADDR: optlen = sizeofInt32 } case linux.SOL_SOCKET: switch name { - case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR: + case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR, linux.SO_TIMESTAMP: optlen = sizeofInt32 } case linux.SOL_TCP: switch name { - case linux.TCP_NODELAY: + case linux.TCP_NODELAY, linux.TCP_INQ: optlen = sizeofInt32 } } @@ -416,6 +416,37 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] return nil } +func (s *socketOpsCommon) recvMsgFromHost(iovs []syscall.Iovec, flags int, senderRequested bool, controlLen uint64) (uint64, int, []byte, []byte, error) { + // We always do a non-blocking recv*(). + sysflags := flags | syscall.MSG_DONTWAIT + + msg := syscall.Msghdr{} + if len(iovs) > 0 { + msg.Iov = &iovs[0] + msg.Iovlen = uint64(len(iovs)) + } + var senderAddrBuf []byte + if senderRequested { + senderAddrBuf = make([]byte, sizeofSockaddr) + msg.Name = &senderAddrBuf[0] + msg.Namelen = uint32(sizeofSockaddr) + } + var controlBuf []byte + if controlLen > 0 { + if controlLen > maxControlLen { + controlLen = maxControlLen + } + controlBuf = make([]byte, controlLen) + msg.Control = &controlBuf[0] + msg.Controllen = controlLen + } + n, err := recvmsg(s.fd, &msg, sysflags) + if err != nil { + return 0 /* n */, 0 /* mFlags */, nil /* senderAddrBuf */, nil /* controlBuf */, err + } + return n, int(msg.Flags), senderAddrBuf[:msg.Namelen], controlBuf[:msg.Controllen], err +} + // RecvMsg implements socket.Socket.RecvMsg. func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { // Only allow known and safe flags. @@ -427,56 +458,36 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument } - var senderAddr linux.SockAddr var senderAddrBuf []byte - if senderRequested { - senderAddrBuf = make([]byte, sizeofSockaddr) - } - var controlBuf []byte var msgFlags int - - recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { - // Refuse to do anything if any part of dst.Addrs was unusable. - if uint64(dst.NumBytes()) != dsts.NumBytes() { - return 0, nil - } - if dsts.IsEmpty() { - return 0, nil - } - - // We always do a non-blocking recv*(). - sysflags := flags | syscall.MSG_DONTWAIT - - iovs := safemem.IovecsFromBlockSeq(dsts) - msg := syscall.Msghdr{ - Iov: &iovs[0], - Iovlen: uint64(len(iovs)), - } - if len(senderAddrBuf) != 0 { - msg.Name = &senderAddrBuf[0] - msg.Namelen = uint32(len(senderAddrBuf)) - } - if controlLen > 0 { - if controlLen > maxControlLen { - controlLen = maxControlLen + copyToDst := func() (int64, error) { + var n uint64 + var err error + if dst.NumBytes() == 0 { + // We want to make the recvmsg(2) call to the host even if dst is empty + // to fetch control messages, sender address or errors if any occur. + n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(nil, flags, senderRequested, controlLen) + return int64(n), err + } + + recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of dst.Addrs was unusable. + if uint64(dst.NumBytes()) != dsts.NumBytes() { + return 0, nil + } + if dsts.IsEmpty() { + return 0, nil } - controlBuf = make([]byte, controlLen) - msg.Control = &controlBuf[0] - msg.Controllen = controlLen - } - n, err := recvmsg(s.fd, &msg, sysflags) - if err != nil { - return 0, err - } - senderAddrBuf = senderAddrBuf[:msg.Namelen] - msgFlags = int(msg.Flags) - controlLen = uint64(msg.Controllen) - return n, nil - }) + + n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(safemem.IovecsFromBlockSeq(dsts), flags, senderRequested, controlLen) + return n, err + }) + return dst.CopyOutFrom(t, recvmsgToBlocks) + } var ch chan struct{} - n, err := dst.CopyOutFrom(t, recvmsgToBlocks) + n, err := copyToDst() if flags&syscall.MSG_DONTWAIT == 0 { for err == syserror.ErrWouldBlock { // We only expect blocking to come from the actual syscall, in which @@ -494,48 +505,75 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags s.EventRegister(&e, waiter.EventIn) defer s.EventUnregister(&e) } - n, err = dst.CopyOutFrom(t, recvmsgToBlocks) + n, err = copyToDst() } } if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } + var senderAddr linux.SockAddr if senderRequested { senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf) } - unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf[:controlLen]) + unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf) if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } + return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), parseUnixControlMessages(unixControlMessages), nil +} +func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) socket.ControlMessages { controlMessages := socket.ControlMessages{} for _, unixCmsg := range unixControlMessages { switch unixCmsg.Header.Level { - case syscall.SOL_IP: + case linux.SOL_SOCKET: + switch unixCmsg.Header.Type { + case linux.SO_TIMESTAMP: + controlMessages.IP.HasTimestamp = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfTimeval], usermem.ByteOrder, &controlMessages.IP.Timestamp) + } + + case linux.SOL_IP: switch unixCmsg.Header.Type { - case syscall.IP_TOS: + case linux.IP_TOS: controlMessages.IP.HasTOS = true binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS) - case syscall.IP_PKTINFO: + case linux.IP_PKTINFO: controlMessages.IP.HasIPPacketInfo = true var packetInfo linux.ControlMessageIPPacketInfo binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo) - controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo) + controlMessages.IP.PacketInfo = packetInfo + + case linux.IP_RECVORIGDSTADDR: + var addr linux.SockAddrInet + binary.Unmarshal(unixCmsg.Data[:addr.SizeBytes()], usermem.ByteOrder, &addr) + controlMessages.IP.OriginalDstAddress = &addr } - case syscall.SOL_IPV6: + case linux.SOL_IPV6: switch unixCmsg.Header.Type { - case syscall.IPV6_TCLASS: + case linux.IPV6_TCLASS: controlMessages.IP.HasTClass = true binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTClass], usermem.ByteOrder, &controlMessages.IP.TClass) + + case linux.IPV6_RECVORIGDSTADDR: + var addr linux.SockAddrInet6 + binary.Unmarshal(unixCmsg.Data[:addr.SizeBytes()], usermem.ByteOrder, &addr) + controlMessages.IP.OriginalDstAddress = &addr + } + + case linux.SOL_TCP: + switch unixCmsg.Header.Type { + case linux.TCP_INQ: + controlMessages.IP.HasInq = true + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageInq], usermem.ByteOrder, &controlMessages.IP.Inq) } } } - - return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), controlMessages, nil + return controlMessages } // SendMsg implements socket.Socket.SendMsg. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 3baad098b..057f4d294 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -120,9 +120,6 @@ type socketOpsCommon struct { // fixed buffer but only consume this many bytes. sendBufferSize uint32 - // passcred indicates if this socket wants SCM credentials. - passcred bool - // filter indicates that this socket has a BPF filter "installed". // // TODO(gvisor.dev/issue/1119): We don't actually support filtering, @@ -201,10 +198,7 @@ func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { // Passcred implements transport.Credentialer.Passcred. func (s *socketOpsCommon) Passcred() bool { - s.mu.Lock() - passcred := s.passcred - s.mu.Unlock() - return passcred + return s.ep.SocketOptions().GetPassCred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. @@ -419,9 +413,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] } passcred := usermem.ByteOrder.Uint32(opt) - s.mu.Lock() - s.passcred = passcred != 0 - s.mu.Unlock() + s.ep.SocketOptions().SetPassCred(passcred != 0) return nil case linux.SO_ATTACH_FILTER: diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 7d0ae15ca..23d5cab9c 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -84,69 +84,95 @@ var Metrics = tcpip.Stats{ MalformedRcvdPackets: mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."), DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."), ICMP: tcpip.ICMPStats{ - V4PacketsSent: tcpip.ICMPv4SentPacketStats{ - ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ - Echo: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo", "Total number of ICMPv4 echo packets sent by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Total number of ICMPv4 echo reply packets sent by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Total number of ICMPv4 destination unreachable packets sent by netstack."), - SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Total number of ICMPv4 source quench packets sent by netstack."), - Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Total number of ICMPv4 redirect packets sent by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Total number of ICMPv4 time exceeded packets sent by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Total number of ICMPv4 parameter problem packets sent by netstack."), - Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Total number of ICMPv4 timestamp packets sent by netstack."), - TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Total number of ICMPv4 timestamp reply packets sent by netstack."), - InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Total number of ICMPv4 information request packets sent by netstack."), - InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Total number of ICMPv4 information reply packets sent by netstack."), + V4: tcpip.ICMPv4Stats{ + PacketsSent: tcpip.ICMPv4SentPacketStats{ + ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ + Echo: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo", "Total number of ICMPv4 echo packets sent by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Total number of ICMPv4 echo reply packets sent by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Total number of ICMPv4 destination unreachable packets sent by netstack."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Total number of ICMPv4 source quench packets sent by netstack."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Total number of ICMPv4 redirect packets sent by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Total number of ICMPv4 time exceeded packets sent by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Total number of ICMPv4 parameter problem packets sent by netstack."), + Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Total number of ICMPv4 timestamp packets sent by netstack."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Total number of ICMPv4 timestamp reply packets sent by netstack."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Total number of ICMPv4 information request packets sent by netstack."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Total number of ICMPv4 information reply packets sent by netstack."), + }, + Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Total number of ICMPv4 packets dropped by netstack due to link layer errors."), + }, + PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ + ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ + Echo: mustCreateMetric("/netstack/icmp/v4/packets_received/echo", "Total number of ICMPv4 echo packets received by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Total number of ICMPv4 echo reply packets received by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Total number of ICMPv4 destination unreachable packets received by netstack."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Total number of ICMPv4 source quench packets received by netstack."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Total number of ICMPv4 redirect packets received by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Total number of ICMPv4 time exceeded packets received by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Total number of ICMPv4 parameter problem packets received by netstack."), + Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Total number of ICMPv4 timestamp packets received by netstack."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Total number of ICMPv4 timestamp reply packets received by netstack."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Total number of ICMPv4 information request packets received by netstack."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Total number of ICMPv4 information reply packets received by netstack."), + }, + Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Total number of ICMPv4 packets received that the transport layer could not parse."), }, - Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Total number of ICMPv4 packets dropped by netstack due to link layer errors."), }, - V4PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ - ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ - Echo: mustCreateMetric("/netstack/icmp/v4/packets_received/echo", "Total number of ICMPv4 echo packets received by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Total number of ICMPv4 echo reply packets received by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Total number of ICMPv4 destination unreachable packets received by netstack."), - SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Total number of ICMPv4 source quench packets received by netstack."), - Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Total number of ICMPv4 redirect packets received by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Total number of ICMPv4 time exceeded packets received by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Total number of ICMPv4 parameter problem packets received by netstack."), - Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Total number of ICMPv4 timestamp packets received by netstack."), - TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Total number of ICMPv4 timestamp reply packets received by netstack."), - InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Total number of ICMPv4 information request packets received by netstack."), - InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Total number of ICMPv4 information reply packets received by netstack."), + V6: tcpip.ICMPv6Stats{ + PacketsSent: tcpip.ICMPv6SentPacketStats{ + ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Total number of ICMPv6 echo request packets sent by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Total number of ICMPv6 echo reply packets sent by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Total number of ICMPv6 destination unreachable packets sent by netstack."), + PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Total number of ICMPv6 packet too big packets sent by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Total number of ICMPv6 time exceeded packets sent by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Total number of ICMPv6 parameter problem packets sent by netstack."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Total number of ICMPv6 router solicit packets sent by netstack."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Total number of ICMPv6 router advert packets sent by netstack."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets sent by netstack."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Total number of ICMPv6 neighbor advert packets sent by netstack."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Total number of ICMPv6 redirect message packets sent by netstack."), + }, + Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Total number of ICMPv6 packets dropped by netstack due to link layer errors."), + }, + PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ + ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Total number of ICMPv6 echo request packets received by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Total number of ICMPv6 echo reply packets received by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Total number of ICMPv6 destination unreachable packets received by netstack."), + PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Total number of ICMPv6 packet too big packets received by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Total number of ICMPv6 time exceeded packets received by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Total number of ICMPv6 parameter problem packets received by netstack."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Total number of ICMPv6 router solicit packets received by netstack."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Total number of ICMPv6 router advert packets received by netstack."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets received by netstack."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Total number of ICMPv6 neighbor advert packets received by netstack."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Total number of ICMPv6 redirect message packets received by netstack."), + }, + Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Total number of ICMPv6 packets received that the transport layer could not parse."), }, - Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Total number of ICMPv4 packets received that the transport layer could not parse."), }, - V6PacketsSent: tcpip.ICMPv6SentPacketStats{ - ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ - EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Total number of ICMPv6 echo request packets sent by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Total number of ICMPv6 echo reply packets sent by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Total number of ICMPv6 destination unreachable packets sent by netstack."), - PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Total number of ICMPv6 packet too big packets sent by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Total number of ICMPv6 time exceeded packets sent by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Total number of ICMPv6 parameter problem packets sent by netstack."), - RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Total number of ICMPv6 router solicit packets sent by netstack."), - RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Total number of ICMPv6 router advert packets sent by netstack."), - NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets sent by netstack."), - NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Total number of ICMPv6 neighbor advert packets sent by netstack."), - RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Total number of ICMPv6 redirect message packets sent by netstack."), + }, + IGMP: tcpip.IGMPStats{ + PacketsSent: tcpip.IGMPSentPacketStats{ + IGMPPacketStats: tcpip.IGMPPacketStats{ + MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Total number of IGMP Membership Query messages sent by netstack."), + V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Total number of IGMPv1 Membership Report messages sent by netstack."), + V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Total number of IGMPv2 Membership Report messages sent by netstack."), + LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Total number of IGMP Leave Group messages sent by netstack."), }, - Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Total number of ICMPv6 packets dropped by netstack due to link layer errors."), + Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Total number of IGMP packets dropped by netstack due to link layer errors."), }, - V6PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ - ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ - EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Total number of ICMPv6 echo request packets received by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Total number of ICMPv6 echo reply packets received by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Total number of ICMPv6 destination unreachable packets received by netstack."), - PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Total number of ICMPv6 packet too big packets received by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Total number of ICMPv6 time exceeded packets received by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Total number of ICMPv6 parameter problem packets received by netstack."), - RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Total number of ICMPv6 router solicit packets received by netstack."), - RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Total number of ICMPv6 router advert packets received by netstack."), - NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets received by netstack."), - NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Total number of ICMPv6 neighbor advert packets received by netstack."), - RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Total number of ICMPv6 redirect message packets received by netstack."), + PacketsReceived: tcpip.IGMPReceivedPacketStats{ + IGMPPacketStats: tcpip.IGMPPacketStats{ + MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Total number of IGMP Membership Query messages received by netstack."), + V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Total number of IGMPv1 Membership Report messages received by netstack."), + V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Total number of IGMPv2 Membership Report messages received by netstack."), + LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Total number of IGMP Leave Group messages received by netstack."), }, - Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Total number of ICMPv6 packets received that the transport layer could not parse."), + Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Total number of IGMP packets received by netstack that could not be parsed."), + ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Total number of received IGMP packets with bad checksums."), + Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Total number of unrecognized IGMP packets received by netstack."), }, }, IP: tcpip.IPStats{ @@ -209,18 +235,6 @@ const sizeOfInt32 int = 4 var errStackType = syserr.New("expected but did not receive a netstack.Stack", linux.EINVAL) -// ntohs converts a 16-bit number from network byte order to host byte order. It -// assumes that the host is little endian. -func ntohs(v uint16) uint16 { - return v<<8 | v>>8 -} - -// htons converts a 16-bit number from host byte order to network byte order. It -// assumes that the host is little endian. -func htons(v uint16) uint16 { - return ntohs(v) -} - // commonEndpoint represents the intersection of a tcpip.Endpoint and a // transport.Endpoint. type commonEndpoint interface { @@ -240,10 +254,6 @@ type commonEndpoint interface { // transport.Endpoint.SetSockOpt. SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error - // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and - // transport.Endpoint.SetSockOptBool. - SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error - // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and // transport.Endpoint.SetSockOptInt. SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error @@ -252,18 +262,20 @@ type commonEndpoint interface { // transport.Endpoint.GetSockOpt. GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error - // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and - // transport.Endpoint.GetSockOpt. - GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and // transport.Endpoint.GetSockOpt. GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) - // LastError implements tcpip.Endpoint.LastError. + // State returns a socket's lifecycle state. The returned value is + // protocol-specific and is primarily used for diagnostics. + State() uint32 + + // LastError implements tcpip.Endpoint.LastError and + // transport.Endpoint.LastError. LastError() *tcpip.Error - // SocketOptions implements tcpip.Endpoint.SocketOptions. + // SocketOptions implements tcpip.Endpoint.SocketOptions and + // transport.Endpoint.SocketOptions. SocketOptions() *tcpip.SocketOptions } @@ -308,7 +320,7 @@ type socketOpsCommon struct { readView buffer.View // readCM holds control message information for the last packet read // from Endpoint. - readCM tcpip.ControlMessages + readCM socket.IPControlMessages sender tcpip.FullAddress linkPacketInfo tcpip.LinkPacketInfo @@ -332,9 +344,7 @@ type socketOpsCommon struct { // New creates a new endpoint socket. func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { if skType == linux.SOCK_STREAM { - if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + endpoint.SocketOptions().SetDelayOption(true) } dirent := socket.NewDirent(t, netstackDevice) @@ -363,88 +373,6 @@ func bytesToIPAddress(addr []byte) tcpip.Address { return tcpip.Address(addr) } -// AddressAndFamily reads an sockaddr struct from the given address and -// converts it to the FullAddress format. It supports AF_UNIX, AF_INET, -// AF_INET6, and AF_PACKET addresses. -// -// AddressAndFamily returns an address and its family. -func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { - // Make sure we have at least 2 bytes for the address family. - if len(addr) < 2 { - return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument - } - - // Get the rest of the fields based on the address family. - switch family := usermem.ByteOrder.Uint16(addr); family { - case linux.AF_UNIX: - path := addr[2:] - if len(path) > linux.UnixPathMax { - return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument - } - // Drop the terminating NUL (if one exists) and everything after - // it for filesystem (non-abstract) addresses. - if len(path) > 0 && path[0] != 0 { - if n := bytes.IndexByte(path[1:], 0); n >= 0 { - path = path[:n+1] - } - } - return tcpip.FullAddress{ - Addr: tcpip.Address(path), - }, family, nil - - case linux.AF_INET: - var a linux.SockAddrInet - if len(addr) < sockAddrInetSize { - return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument - } - binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) - - out := tcpip.FullAddress{ - Addr: bytesToIPAddress(a.Addr[:]), - Port: ntohs(a.Port), - } - return out, family, nil - - case linux.AF_INET6: - var a linux.SockAddrInet6 - if len(addr) < sockAddrInet6Size { - return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument - } - binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) - - out := tcpip.FullAddress{ - Addr: bytesToIPAddress(a.Addr[:]), - Port: ntohs(a.Port), - } - if isLinkLocal(out.Addr) { - out.NIC = tcpip.NICID(a.Scope_id) - } - return out, family, nil - - case linux.AF_PACKET: - var a linux.SockAddrLink - if len(addr) < sockAddrLinkSize { - return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument - } - binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a) - if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { - return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument - } - - // TODO(gvisor.dev/issue/173): Return protocol too. - return tcpip.FullAddress{ - NIC: tcpip.NICID(a.InterfaceIndex), - Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), - }, family, nil - - case linux.AF_UNSPEC: - return tcpip.FullAddress{}, family, nil - - default: - return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported - } -} - func (s *socketOpsCommon) isPacketBased() bool { return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } @@ -480,7 +408,7 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error { } s.readView = v - s.readCM = cms + s.readCM = socket.NewIPControlMessages(s.family, cms) atomic.StoreUint32(&s.readViewHasData, 1) return nil @@ -500,11 +428,7 @@ func (s *socketOpsCommon) Release(ctx context.Context) { return } - var v tcpip.LingerOption - if err := s.Endpoint.GetSockOpt(&v); err != nil { - return - } - + v := s.Endpoint.SocketOptions().GetLinger() // The case for zero timeout is handled in tcp endpoint close function. // Close is blocked until either: // 1. The endpoint state is not in any of the states: FIN-WAIT1, @@ -721,11 +645,7 @@ func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error { return nil } if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { - v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption) - if err != nil { - return syserr.TranslateNetstackError(err) - } - if !v { + if !s.Endpoint.SocketOptions().GetV6Only() { return nil } } @@ -749,7 +669,7 @@ func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { - addr, family, err := AddressAndFamily(sockaddr) + addr, family, err := socket.AddressAndFamily(sockaddr) if err != nil { return err } @@ -830,7 +750,7 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } } else { var err *syserr.Error - addr, family, err = AddressAndFamily(sockaddr) + addr, family, err = socket.AddressAndFamily(sockaddr) if err != nil { return err } @@ -921,7 +841,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { - addr, addrLen = ConvertAddress(s.family, *peerAddr) + addr, addrLen = socket.ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ @@ -1005,7 +925,7 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in return getSockOptSocket(t, s, ep, family, skType, name, outLen) case linux.SOL_TCP: - return getSockOptTCP(t, ep, name, outLen) + return getSockOptTCP(t, s, ep, name, outLen) case linux.SOL_IPV6: return getSockOptIPv6(t, s, ep, name, outPtr, outLen) @@ -1041,7 +961,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } // Get the last error and convert it. - err := ep.LastError() + err := ep.SocketOptions().GetLastError() if err == nil { optP := primitive.Int32(0) return &optP, nil @@ -1068,13 +988,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.PasscredOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred())) + return &v, nil case linux.SO_SNDBUF: if outLen < sizeOfInt32 { @@ -1115,25 +1030,16 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.ReuseAddressOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress())) + return &v, nil case linux.SO_REUSEPORT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.ReusePortOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort())) + return &v, nil case linux.SO_BINDTODEVICE: var v tcpip.BindToDeviceOption @@ -1174,24 +1080,16 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.KeepaliveEnabledOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive())) + return &v, nil case linux.SO_LINGER: if outLen < linux.SizeOfLinger { return nil, syserr.ErrInvalidArgument } - var v tcpip.LingerOption var linger linux.Linger - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + v := ep.SocketOptions().GetLinger() if v.Enabled { linger.OnOff = 1 @@ -1222,34 +1120,26 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - var v tcpip.OutOfBandInlineOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(v) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline())) + return &v, nil case linux.SO_NO_CHECK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.NoChecksumOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum())) + return &v, nil case linux.SO_ACCEPTCONN: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.AcceptConnOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) + // This option is only viable for TCP endpoints. + var v bool + if _, skType, skProto := s.Type(); isTCPSocket(skType, skProto) { + v = tcp.EndpointState(ep.State()) == tcp.StateListen } vP := primitive.Int32(boolToInt32(v)) return &vP, nil @@ -1261,46 +1151,36 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } // getSockOptTCP implements GetSockOpt when level is SOL_TCP. -func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { +func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { + if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) { + log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto) + return nil, syserr.ErrUnknownProtocolOption + } + switch name { case linux.TCP_NODELAY: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.DelayOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(!v)) - return &vP, nil + v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption())) + return &v, nil case linux.TCP_CORK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.CorkOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption())) + return &v, nil case linux.TCP_QUICKACK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.QuickAckOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck())) + return &v, nil case linux.TCP_MAXSEG: if outLen < sizeOfInt32 { @@ -1474,19 +1354,24 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { + if _, ok := ep.(tcpip.Endpoint); !ok { + log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) + return nil, syserr.ErrUnknownProtocolOption + } + + family, skType, _ := s.Type() + if family != linux.AF_INET6 { + return nil, syserr.ErrUnknownProtocolOption + } + switch name { case linux.IPV6_V6ONLY: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.V6OnlyOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only())) + return &v, nil case linux.IPV6_PATHMTU: t.Kernel().EmitUnimplementedEvent(t) @@ -1518,13 +1403,16 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass())) + return &v, nil + + case linux.IPV6_RECVORIGDSTADDR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument } - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) + return &v, nil case linux.IP6T_ORIGINAL_DST: if outLen < int(binary.Size(linux.SockAddrInet6{})) { @@ -1536,7 +1424,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.TranslateNetstackError(err) } - a, _ := ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) + a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) return a.(*linux.SockAddrInet6), nil case linux.IP6T_SO_GET_INFO: @@ -1545,7 +1433,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name } // Only valid for raw IPv6 sockets. - if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } @@ -1565,7 +1453,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. - if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } @@ -1585,7 +1473,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name } // Only valid for raw IPv6 sockets. - if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } @@ -1607,6 +1495,11 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name // getSockOptIP implements GetSockOpt when level is SOL_IP. func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) { + if _, ok := ep.(tcpip.Endpoint); !ok { + log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) + return nil, syserr.ErrUnknownProtocolOption + } + switch name { case linux.IP_TTL: if outLen < sizeOfInt32 { @@ -1649,7 +1542,7 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.TranslateNetstackError(err) } - a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) + a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) return &a.(*linux.SockAddrInet).Addr, nil @@ -1658,13 +1551,8 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.MulticastLoopOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop())) + return &v, nil case linux.IP_TOS: // Length handling for parity with Linux. @@ -1688,26 +1576,32 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS())) + return &v, nil + + case linux.IP_PKTINFO: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument } - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo())) + return &v, nil - case linux.IP_PKTINFO: + case linux.IP_HDRINCL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption) - if err != nil { - return nil, syserr.TranslateNetstackError(err) + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded())) + return &v, nil + + case linux.IP_RECVORIGDSTADDR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument } - vP := primitive.Int32(boolToInt32(v)) - return &vP, nil + v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) + return &v, nil case linux.SO_ORIGINAL_DST: if outLen < int(binary.Size(linux.SockAddrInet{})) { @@ -1719,7 +1613,7 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.TranslateNetstackError(err) } - a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) + a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) return a.(*linux.SockAddrInet), nil case linux.IPT_SO_GET_INFO: @@ -1826,7 +1720,7 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int return setSockOptSocket(t, s, ep, name, optVal) case linux.SOL_TCP: - return setSockOptTCP(t, ep, name, optVal) + return setSockOptTCP(t, s, ep, name, optVal) case linux.SOL_IPV6: return setSockOptIPv6(t, s, ep, name, optVal) @@ -1876,7 +1770,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReuseAddressOption, v != 0)) + ep.SocketOptions().SetReuseAddress(v != 0) + return nil case linux.SO_REUSEPORT: if len(optVal) < sizeOfInt32 { @@ -1884,7 +1779,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReusePortOption, v != 0)) + ep.SocketOptions().SetReusePort(v != 0) + return nil case linux.SO_BINDTODEVICE: n := bytes.IndexByte(optVal, 0) @@ -1923,7 +1819,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0)) + ep.SocketOptions().SetPassCred(v != 0) + return nil case linux.SO_KEEPALIVE: if len(optVal) < sizeOfInt32 { @@ -1931,7 +1828,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.KeepaliveEnabledOption, v != 0)) + ep.SocketOptions().SetKeepAlive(v != 0) + return nil case linux.SO_SNDTIMEO: if len(optVal) < linux.SizeOfTimeval { @@ -1970,8 +1868,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam socket.SetSockOptEmitUnimplementedEvent(t, name) } - opt := tcpip.OutOfBandInlineOption(v) - return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) + ep.SocketOptions().SetOutOfBandInline(v != 0) + return nil case linux.SO_NO_CHECK: if len(optVal) < sizeOfInt32 { @@ -1979,7 +1877,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.NoChecksumOption, v != 0)) + ep.SocketOptions().SetNoChecksum(v != 0) + return nil case linux.SO_LINGER: if len(optVal) < linux.SizeOfLinger { @@ -1993,10 +1892,11 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam socket.SetSockOptEmitUnimplementedEvent(t, name) } - return syserr.TranslateNetstackError( - ep.SetSockOpt(&tcpip.LingerOption{ - Enabled: v.OnOff != 0, - Timeout: time.Second * time.Duration(v.Linger)})) + ep.SocketOptions().SetLinger(tcpip.LingerOption{ + Enabled: v.OnOff != 0, + Timeout: time.Second * time.Duration(v.Linger), + }) + return nil case linux.SO_DETACH_FILTER: // optval is ignored. @@ -2011,7 +1911,12 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } // setSockOptTCP implements SetSockOpt when level is SOL_TCP. -func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) { + log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto) + return syserr.ErrUnknownProtocolOption + } + switch name { case linux.TCP_NODELAY: if len(optVal) < sizeOfInt32 { @@ -2019,7 +1924,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.DelayOption, v == 0)) + ep.SocketOptions().SetDelayOption(v == 0) + return nil case linux.TCP_CORK: if len(optVal) < sizeOfInt32 { @@ -2027,7 +1933,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.CorkOption, v != 0)) + ep.SocketOptions().SetCorkOption(v != 0) + return nil case linux.TCP_QUICKACK: if len(optVal) < sizeOfInt32 { @@ -2035,7 +1942,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.QuickAckOption, v != 0)) + ep.SocketOptions().SetQuickAck(v != 0) + return nil case linux.TCP_MAXSEG: if len(optVal) < sizeOfInt32 { @@ -2147,14 +2055,31 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + if _, ok := ep.(tcpip.Endpoint); !ok { + log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) + return syserr.ErrUnknownProtocolOption + } + + family, skType, skProto := s.Type() + if family != linux.AF_INET6 { + return syserr.ErrUnknownProtocolOption + } + switch name { case linux.IPV6_V6ONLY: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } + if isTCPSocket(skType, skProto) && tcp.EndpointState(ep.State()) != tcp.StateInitial { + return syserr.ErrInvalidEndpointState + } else if isUDPSocket(skType, skProto) && udp.EndpointState(ep.State()) != udp.StateInitial { + return syserr.ErrInvalidEndpointState + } + v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.V6OnlyOption, v != 0)) + ep.SocketOptions().SetV6Only(v != 0) + return nil case linux.IPV6_ADD_MEMBERSHIP, linux.IPV6_DROP_MEMBERSHIP, @@ -2174,6 +2099,15 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name t.Kernel().EmitUnimplementedEvent(t) + case linux.IPV6_RECVORIGDSTADDR: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := int32(usermem.ByteOrder.Uint32(optVal)) + + ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) + return nil + case linux.IPV6_TCLASS: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument @@ -2193,7 +2127,8 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return err } - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0)) + ep.SocketOptions().SetReceiveTClass(v != 0) + return nil case linux.IP6T_SO_SET_REPLACE: if len(optVal) < linux.SizeOfIP6TReplace { @@ -2201,7 +2136,7 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name } // Only valid for raw IPv6 sockets. - if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + if skType != linux.SOCK_RAW { return syserr.ErrProtocolNotAvailable } @@ -2276,6 +2211,11 @@ func parseIntOrChar(buf []byte) (int32, *syserr.Error) { // setSockOptIP implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + if _, ok := ep.(tcpip.Endpoint); !ok { + log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) + return syserr.ErrUnknownProtocolOption + } + switch name { case linux.IP_MULTICAST_TTL: v, err := parseIntOrChar(optVal) @@ -2328,7 +2268,7 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ NIC: tcpip.NICID(req.InterfaceIndex), - InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]), + InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]), })) case linux.IP_MULTICAST_LOOP: @@ -2337,7 +2277,8 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return err } - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.MulticastLoopOption, v != 0)) + ep.SocketOptions().SetMulticastLoop(v != 0) + return nil case linux.MCAST_JOIN_GROUP: // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. @@ -2373,7 +2314,8 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in if err != nil { return err } - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0)) + ep.SocketOptions().SetReceiveTOS(v != 0) + return nil case linux.IP_PKTINFO: if len(optVal) == 0 { @@ -2383,7 +2325,8 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in if err != nil { return err } - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0)) + ep.SocketOptions().SetReceivePacketInfo(v != 0) + return nil case linux.IP_HDRINCL: if len(optVal) == 0 { @@ -2393,7 +2336,20 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in if err != nil { return err } - return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0)) + ep.SocketOptions().SetHeaderIncluded(v != 0) + return nil + + case linux.IP_RECVORIGDSTADDR: + if len(optVal) == 0 { + return nil + } + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) + return nil case linux.IPT_SO_SET_REPLACE: if len(optVal) < linux.SizeOfIPTReplace { @@ -2433,7 +2389,6 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in linux.IP_RECVERR, linux.IP_RECVFRAGSIZE, linux.IP_RECVOPTS, - linux.IP_RECVORIGDSTADDR, linux.IP_RECVTTL, linux.IP_RETOPTS, linux.IP_TRANSPARENT, @@ -2511,7 +2466,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) { linux.IPV6_RECVFRAGSIZE, linux.IPV6_RECVHOPLIMIT, linux.IPV6_RECVHOPOPTS, - linux.IPV6_RECVORIGDSTADDR, linux.IPV6_RECVPATHMTU, linux.IPV6_RECVPKTINFO, linux.IPV6_RECVRTHDR, @@ -2535,7 +2489,6 @@ func emitUnimplementedEventIP(t *kernel.Task, name int) { switch name { case linux.IP_TOS, linux.IP_TTL, - linux.IP_HDRINCL, linux.IP_OPTIONS, linux.IP_ROUTER_ALERT, linux.IP_RECVOPTS, @@ -2582,72 +2535,6 @@ func emitUnimplementedEventIP(t *kernel.Task, name int) { } } -// isLinkLocal determines if the given IPv6 address is link-local. This is the -// case when it has the fe80::/10 prefix. This check is used to determine when -// the NICID is relevant for a given IPv6 address. -func isLinkLocal(addr tcpip.Address) bool { - return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80 -} - -// ConvertAddress converts the given address to a native format. -func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) { - switch family { - case linux.AF_UNIX: - var out linux.SockAddrUnix - out.Family = linux.AF_UNIX - l := len([]byte(addr.Addr)) - for i := 0; i < l; i++ { - out.Path[i] = int8(addr.Addr[i]) - } - - // Linux returns the used length of the address struct (including the - // null terminator) for filesystem paths. The Family field is 2 bytes. - // It is sometimes allowed to exclude the null terminator if the - // address length is the max. Abstract and empty paths always return - // the full exact length. - if l == 0 || out.Path[0] == 0 || l == len(out.Path) { - return &out, uint32(2 + l) - } - return &out, uint32(3 + l) - - case linux.AF_INET: - var out linux.SockAddrInet - copy(out.Addr[:], addr.Addr) - out.Family = linux.AF_INET - out.Port = htons(addr.Port) - return &out, uint32(sockAddrInetSize) - - case linux.AF_INET6: - var out linux.SockAddrInet6 - if len(addr.Addr) == header.IPv4AddressSize { - // Copy address in v4-mapped format. - copy(out.Addr[12:], addr.Addr) - out.Addr[10] = 0xff - out.Addr[11] = 0xff - } else { - copy(out.Addr[:], addr.Addr) - } - out.Family = linux.AF_INET6 - out.Port = htons(addr.Port) - if isLinkLocal(addr.Addr) { - out.Scope_id = uint32(addr.NIC) - } - return &out, uint32(sockAddrInet6Size) - - case linux.AF_PACKET: - // TODO(gvisor.dev/issue/173): Return protocol too. - var out linux.SockAddrLink - out.Family = linux.AF_PACKET - out.InterfaceIndex = int32(addr.NIC) - out.HardwareAddrLen = header.EthernetAddressSize - copy(out.HardwareAddr[:], addr.Addr) - return &out, uint32(sockAddrLinkSize) - - default: - return nil, 0 - } -} - // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { @@ -2656,7 +2543,7 @@ func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, * return nil, 0, syserr.TranslateNetstackError(err) } - a, l := ConvertAddress(s.family, addr) + a, l := socket.ConvertAddress(s.family, addr) return a, l, nil } @@ -2668,7 +2555,7 @@ func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, * return nil, 0, syserr.TranslateNetstackError(err) } - a, l := ConvertAddress(s.family, addr) + a, l := socket.ConvertAddress(s.family, addr) return a, l, nil } @@ -2686,7 +2573,7 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ // Always do at least one fetchReadView, even if the number of bytes to // read is 0. err = s.fetchReadView() - if err != nil { + if err != nil || len(s.readView) == 0 { break } if dst.NumBytes() == 0 { @@ -2709,15 +2596,20 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ } copied += n s.readView.TrimFront(n) - if len(s.readView) == 0 { - atomic.StoreUint32(&s.readViewHasData, 0) - } dst = dst.DropFirst(n) if e != nil { err = syserr.FromError(e) break } + // If we are done reading requested data then stop. + if dst.NumBytes() == 0 { + break + } + } + + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) } // If we managed to copy something, we must deliver it. @@ -2812,10 +2704,10 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq var addr linux.SockAddr var addrLen uint32 if isPacket && senderRequested { - addr, addrLen = ConvertAddress(s.family, s.sender) + addr, addrLen = socket.ConvertAddress(s.family, s.sender) switch v := addr.(type) { case *linux.SockAddrLink: - v.Protocol = htons(uint16(s.linkPacketInfo.Protocol)) + v.Protocol = socket.Htons(uint16(s.linkPacketInfo.Protocol)) v.PacketType = toLinuxPacketType(s.linkPacketInfo.PktType) } } @@ -2833,7 +2725,7 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq // We need to peek beyond the first message. dst = dst.DropFirst(n) num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) { - n, _, err := s.Endpoint.Peek(dsts) + n, err := s.Endpoint.Peek(dsts) // TODO(b/78348848): Handle peek timestamp. if err != nil { return int64(n), syserr.TranslateNetstackError(err).ToError() @@ -2877,15 +2769,16 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq func (s *socketOpsCommon) controlMessages() socket.ControlMessages { return socket.ControlMessages{ - IP: tcpip.ControlMessages{ - HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, - Timestamp: s.readCM.Timestamp, - HasTOS: s.readCM.HasTOS, - TOS: s.readCM.TOS, - HasTClass: s.readCM.HasTClass, - TClass: s.readCM.TClass, - HasIPPacketInfo: s.readCM.HasIPPacketInfo, - PacketInfo: s.readCM.PacketInfo, + IP: socket.IPControlMessages{ + HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, + Timestamp: s.readCM.Timestamp, + HasTOS: s.readCM.HasTOS, + TOS: s.readCM.TOS, + HasTClass: s.readCM.HasTClass, + TClass: s.readCM.TClass, + HasIPPacketInfo: s.readCM.HasIPPacketInfo, + PacketInfo: s.readCM.PacketInfo, + OriginalDstAddress: s.readCM.OriginalDstAddress, }, } } @@ -2980,7 +2873,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b var addr *tcpip.FullAddress if len(to) > 0 { - addrBuf, family, err := AddressAndFamily(to) + addrBuf, family, err := socket.AddressAndFamily(to) if err != nil { return 0, err } @@ -3399,6 +3292,18 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { return rv } +func isTCPSocket(skType linux.SockType, skProto int) bool { + return skType == linux.SOCK_STREAM && (skProto == 0 || skProto == syscall.IPPROTO_TCP) +} + +func isUDPSocket(skType linux.SockType, skProto int) bool { + return skType == linux.SOCK_DGRAM && (skProto == 0 || skProto == syscall.IPPROTO_UDP) +} + +func isICMPSocket(skType linux.SockType, skProto int) bool { + return skType == linux.SOCK_DGRAM && (skProto == syscall.IPPROTO_ICMP || skProto == syscall.IPPROTO_ICMPV6) +} + // State implements socket.Socket.State. State translates the internal state // returned by netstack to values defined by Linux. func (s *socketOpsCommon) State() uint32 { @@ -3408,7 +3313,7 @@ func (s *socketOpsCommon) State() uint32 { } switch { - case s.skType == linux.SOCK_STREAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_TCP: + case isTCPSocket(s.skType, s.protocol): // TCP socket. switch tcp.EndpointState(s.Endpoint.State()) { case tcp.StateEstablished: @@ -3437,7 +3342,7 @@ func (s *socketOpsCommon) State() uint32 { // Internal or unknown state. return 0 } - case s.skType == linux.SOCK_DGRAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_UDP: + case isUDPSocket(s.skType, s.protocol): // UDP socket. switch udp.EndpointState(s.Endpoint.State()) { case udp.StateInitial, udp.StateBound, udp.StateClosed: @@ -3447,7 +3352,7 @@ func (s *socketOpsCommon) State() uint32 { default: return 0 } - case s.skType == linux.SOCK_DGRAM && s.protocol == syscall.IPPROTO_ICMP || s.protocol == syscall.IPPROTO_ICMPV6: + case isICMPSocket(s.skType, s.protocol): // TODO(b/112063468): Export states for ICMP sockets. case s.skType == linux.SOCK_RAW: // TODO(b/112063468): Export states for raw sockets. diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index b0d9e4d9e..b756bfca0 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -51,9 +51,7 @@ var _ = socket.SocketVFS2(&SocketVFS2{}) // NewVFS2 creates a new endpoint socket. func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { if skType == linux.SOCK_STREAM { - if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + endpoint.SocketOptions().SetDelayOption(true) } mnt := t.Kernel().SocketMount() @@ -191,7 +189,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block var addrLen uint32 if peerAddr != nil { // Get address of the peer and write it to peer slice. - addr, addrLen = ConvertAddress(s.family, *peerAddr) + addr, addrLen = socket.ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index ead3b2b79..c847ff1c7 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -158,7 +158,7 @@ func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol // protocol is passed in network byte order, but netstack wants it in // host order. - netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + netProto := tcpip.NetworkProtocolNumber(socket.Ntohs(uint16(protocol))) wq := &waiter.Queue{} ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go index 2a01143f6..0af805246 100644 --- a/pkg/sentry/socket/netstack/provider_vfs2.go +++ b/pkg/sentry/socket/netstack/provider_vfs2.go @@ -102,7 +102,7 @@ func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, prot // protocol is passed in network byte order, but netstack wants it in // host order. - netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + netProto := tcpip.NetworkProtocolNumber(socket.Ntohs(uint16(protocol))) wq := &waiter.Queue{} ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index fa9ac9059..cc0fadeb5 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -324,12 +324,12 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { 0, // Support Ip/FragCreates. } case *inet.StatSNMPICMP: - in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats - out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats + in := Metrics.ICMP.V4.PacketsReceived.ICMPv4PacketStats + out := Metrics.ICMP.V4.PacketsSent.ICMPv4PacketStats // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPICMP{ 0, // Icmp/InMsgs. - Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors. + Metrics.ICMP.V4.PacketsSent.Dropped.Value(), // InErrors. 0, // Icmp/InCsumErrors. in.DstUnreachable.Value(), // InDestUnreachs. in.TimeExceeded.Value(), // InTimeExcds. @@ -343,18 +343,18 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { in.InfoRequest.Value(), // InAddrMasks. in.InfoReply.Value(), // InAddrMaskReps. 0, // Icmp/OutMsgs. - Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors. - out.DstUnreachable.Value(), // OutDestUnreachs. - out.TimeExceeded.Value(), // OutTimeExcds. - out.ParamProblem.Value(), // OutParmProbs. - out.SrcQuench.Value(), // OutSrcQuenchs. - out.Redirect.Value(), // OutRedirects. - out.Echo.Value(), // OutEchos. - out.EchoReply.Value(), // OutEchoReps. - out.Timestamp.Value(), // OutTimestamps. - out.TimestampReply.Value(), // OutTimestampReps. - out.InfoRequest.Value(), // OutAddrMasks. - out.InfoReply.Value(), // OutAddrMaskReps. + Metrics.ICMP.V4.PacketsReceived.Invalid.Value(), // OutErrors. + out.DstUnreachable.Value(), // OutDestUnreachs. + out.TimeExceeded.Value(), // OutTimeExcds. + out.ParamProblem.Value(), // OutParmProbs. + out.SrcQuench.Value(), // OutSrcQuenchs. + out.Redirect.Value(), // OutRedirects. + out.Echo.Value(), // OutEchos. + out.EchoReply.Value(), // OutEchoReps. + out.Timestamp.Value(), // OutTimestamps. + out.TimestampReply.Value(), // OutTimestampReps. + out.InfoRequest.Value(), // OutAddrMasks. + out.InfoReply.Value(), // OutAddrMaskReps. } case *inet.StatSNMPTCP: tcp := Metrics.TCP diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index fd31479e5..bcc426e33 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -18,6 +18,7 @@ package socket import ( + "bytes" "fmt" "sync/atomic" "syscall" @@ -35,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/usermem" ) @@ -42,7 +44,79 @@ import ( // control messages. type ControlMessages struct { Unix transport.ControlMessages - IP tcpip.ControlMessages + IP IPControlMessages +} + +// packetInfoToLinux converts IPPacketInfo from tcpip format to Linux format. +func packetInfoToLinux(packetInfo tcpip.IPPacketInfo) linux.ControlMessageIPPacketInfo { + var p linux.ControlMessageIPPacketInfo + p.NIC = int32(packetInfo.NIC) + copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr)) + copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr)) + return p +} + +// NewIPControlMessages converts the tcpip ControlMessgaes (which does not +// have Linux specific format) to Linux format. +func NewIPControlMessages(family int, cmgs tcpip.ControlMessages) IPControlMessages { + var orgDstAddr linux.SockAddr + if cmgs.HasOriginalDstAddress { + orgDstAddr, _ = ConvertAddress(family, cmgs.OriginalDstAddress) + } + return IPControlMessages{ + HasTimestamp: cmgs.HasTimestamp, + Timestamp: cmgs.Timestamp, + HasInq: cmgs.HasInq, + Inq: cmgs.Inq, + HasTOS: cmgs.HasTOS, + TOS: cmgs.TOS, + HasTClass: cmgs.HasTClass, + TClass: cmgs.TClass, + HasIPPacketInfo: cmgs.HasIPPacketInfo, + PacketInfo: packetInfoToLinux(cmgs.PacketInfo), + OriginalDstAddress: orgDstAddr, + } +} + +// IPControlMessages contains socket control messages for IP sockets. +// This can contain Linux specific structures unlike tcpip.ControlMessages. +// +// +stateify savable +type IPControlMessages struct { + // HasTimestamp indicates whether Timestamp is valid/set. + HasTimestamp bool + + // Timestamp is the time (in ns) that the last packet used to create + // the read data was received. + Timestamp int64 + + // HasInq indicates whether Inq is valid/set. + HasInq bool + + // Inq is the number of bytes ready to be received. + Inq int32 + + // HasTOS indicates whether Tos is valid/set. + HasTOS bool + + // TOS is the IPv4 type of service of the associated packet. + TOS uint8 + + // HasTClass indicates whether TClass is valid/set. + HasTClass bool + + // TClass is the IPv6 traffic class of the associated packet. + TClass uint32 + + // HasIPPacketInfo indicates whether PacketInfo is set. + HasIPPacketInfo bool + + // PacketInfo holds interface and address data on an incoming packet. + PacketInfo linux.ControlMessageIPPacketInfo + + // OriginalDestinationAddress holds the original destination address + // and port of the incoming packet. + OriginalDstAddress linux.SockAddr } // Release releases Unix domain socket credentials and rights. @@ -460,3 +534,176 @@ func UnmarshalSockAddr(family int, data []byte) linux.SockAddr { panic(fmt.Sprintf("Unsupported socket family %v", family)) } } + +var sockAddrLinkSize = (&linux.SockAddrLink{}).SizeBytes() +var sockAddrInetSize = (&linux.SockAddrInet{}).SizeBytes() +var sockAddrInet6Size = (&linux.SockAddrInet6{}).SizeBytes() + +// Ntohs converts a 16-bit number from network byte order to host byte order. It +// assumes that the host is little endian. +func Ntohs(v uint16) uint16 { + return v<<8 | v>>8 +} + +// Htons converts a 16-bit number from host byte order to network byte order. It +// assumes that the host is little endian. +func Htons(v uint16) uint16 { + return Ntohs(v) +} + +// isLinkLocal determines if the given IPv6 address is link-local. This is the +// case when it has the fe80::/10 prefix. This check is used to determine when +// the NICID is relevant for a given IPv6 address. +func isLinkLocal(addr tcpip.Address) bool { + return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80 +} + +// ConvertAddress converts the given address to a native format. +func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) { + switch family { + case linux.AF_UNIX: + var out linux.SockAddrUnix + out.Family = linux.AF_UNIX + l := len([]byte(addr.Addr)) + for i := 0; i < l; i++ { + out.Path[i] = int8(addr.Addr[i]) + } + + // Linux returns the used length of the address struct (including the + // null terminator) for filesystem paths. The Family field is 2 bytes. + // It is sometimes allowed to exclude the null terminator if the + // address length is the max. Abstract and empty paths always return + // the full exact length. + if l == 0 || out.Path[0] == 0 || l == len(out.Path) { + return &out, uint32(2 + l) + } + return &out, uint32(3 + l) + + case linux.AF_INET: + var out linux.SockAddrInet + copy(out.Addr[:], addr.Addr) + out.Family = linux.AF_INET + out.Port = Htons(addr.Port) + return &out, uint32(sockAddrInetSize) + + case linux.AF_INET6: + var out linux.SockAddrInet6 + if len(addr.Addr) == header.IPv4AddressSize { + // Copy address in v4-mapped format. + copy(out.Addr[12:], addr.Addr) + out.Addr[10] = 0xff + out.Addr[11] = 0xff + } else { + copy(out.Addr[:], addr.Addr) + } + out.Family = linux.AF_INET6 + out.Port = Htons(addr.Port) + if isLinkLocal(addr.Addr) { + out.Scope_id = uint32(addr.NIC) + } + return &out, uint32(sockAddrInet6Size) + + case linux.AF_PACKET: + // TODO(gvisor.dev/issue/173): Return protocol too. + var out linux.SockAddrLink + out.Family = linux.AF_PACKET + out.InterfaceIndex = int32(addr.NIC) + out.HardwareAddrLen = header.EthernetAddressSize + copy(out.HardwareAddr[:], addr.Addr) + return &out, uint32(sockAddrLinkSize) + + default: + return nil, 0 + } +} + +// BytesToIPAddress converts an IPv4 or IPv6 address from the user to the +// netstack representation taking any addresses into account. +func BytesToIPAddress(addr []byte) tcpip.Address { + if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) { + return "" + } + return tcpip.Address(addr) +} + +// AddressAndFamily reads an sockaddr struct from the given address and +// converts it to the FullAddress format. It supports AF_UNIX, AF_INET, +// AF_INET6, and AF_PACKET addresses. +// +// AddressAndFamily returns an address and its family. +func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { + // Make sure we have at least 2 bytes for the address family. + if len(addr) < 2 { + return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument + } + + // Get the rest of the fields based on the address family. + switch family := usermem.ByteOrder.Uint16(addr); family { + case linux.AF_UNIX: + path := addr[2:] + if len(path) > linux.UnixPathMax { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + // Drop the terminating NUL (if one exists) and everything after + // it for filesystem (non-abstract) addresses. + if len(path) > 0 && path[0] != 0 { + if n := bytes.IndexByte(path[1:], 0); n >= 0 { + path = path[:n+1] + } + } + return tcpip.FullAddress{ + Addr: tcpip.Address(path), + }, family, nil + + case linux.AF_INET: + var a linux.SockAddrInet + if len(addr) < sockAddrInetSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) + + out := tcpip.FullAddress{ + Addr: BytesToIPAddress(a.Addr[:]), + Port: Ntohs(a.Port), + } + return out, family, nil + + case linux.AF_INET6: + var a linux.SockAddrInet6 + if len(addr) < sockAddrInet6Size { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) + + out := tcpip.FullAddress{ + Addr: BytesToIPAddress(a.Addr[:]), + Port: Ntohs(a.Port), + } + if isLinkLocal(out.Addr) { + out.NIC = tcpip.NICID(a.Scope_id) + } + return out, family, nil + + case linux.AF_PACKET: + var a linux.SockAddrLink + if len(addr) < sockAddrLinkSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a) + if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { + return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/173): Return protocol too. + return tcpip.FullAddress{ + NIC: tcpip.NICID(a.InterfaceIndex), + Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + }, family, nil + + case linux.AF_UNSPEC: + return tcpip.FullAddress{}, family, nil + + default: + return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported + } +} diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 6d9e502bd..9f7aca305 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -118,28 +118,24 @@ var ( // NewConnectioned creates a new unbound connectionedEndpoint. func NewConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) Endpoint { - return &connectionedEndpoint{ + return newConnectioned(ctx, stype, uid) +} + +func newConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) *connectionedEndpoint { + ep := &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, id: uid.UniqueID(), idGenerator: uid, stype: stype, } + ep.ops.InitHandler(ep) + return ep } // NewPair allocates a new pair of connected unix-domain connectionedEndpoints. func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { - a := &connectionedEndpoint{ - baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, - id: uid.UniqueID(), - idGenerator: uid, - stype: stype, - } - b := &connectionedEndpoint{ - baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, - id: uid.UniqueID(), - idGenerator: uid, - stype: stype, - } + a := newConnectioned(ctx, stype, uid) + b := newConnectioned(ctx, stype, uid) q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} q1.InitRefs() @@ -171,12 +167,14 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E // NewExternal creates a new externally backed Endpoint. It behaves like a // socketpair. func NewExternal(ctx context.Context, stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { - return &connectionedEndpoint{ + ep := &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, id: uid.UniqueID(), idGenerator: uid, stype: stype, } + ep.ops.InitHandler(ep) + return ep } // ID implements ConnectingEndpoint.ID. @@ -298,6 +296,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn idGenerator: e.idGenerator, stype: e.stype, } + ne.ops.InitHandler(ne) readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} readQueue.InitRefs() diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 1406971bc..0813ad87d 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -44,6 +44,7 @@ func NewConnectionless(ctx context.Context) Endpoint { q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} q.InitRefs() ep.receiver = &queueReceiver{readQueue: &q} + ep.ops.InitHandler(ep) return ep } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 18a50e9f8..099a56281 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -16,8 +16,6 @@ package transport import ( - "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -180,10 +178,6 @@ type Endpoint interface { // SetSockOpt sets a socket option. SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error - // SetSockOptBool sets a socket option for simple cases when a value has - // the int type. - SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error - // SetSockOptInt sets a socket option for simple cases when a value has // the int type. SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error @@ -191,10 +185,6 @@ type Endpoint interface { // GetSockOpt gets a socket option. GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error - // GetSockOptBool gets a socket option for simple cases when a return - // value has the int type. - GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) - // GetSockOptInt gets a socket option for simple cases when a return // value has the int type. GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) @@ -203,10 +193,11 @@ type Endpoint interface { // procfs. State() uint32 - // LastError implements tcpip.Endpoint.LastError. + // LastError clears and returns the last error reported by the endpoint. LastError() *tcpip.Error - // SocketOptions implements tcpip.Endpoint.SocketOptions. + // SocketOptions returns the structure which contains all the socket + // level options. SocketOptions() *tcpip.SocketOptions } @@ -739,10 +730,7 @@ func (e *connectedEndpoint) CloseUnread() { // +stateify savable type baseEndpoint struct { *waiter.Queue - - // passcred specifies whether SCM_CREDENTIALS socket control messages are - // enabled on this endpoint. Must be accessed atomically. - passcred int32 + tcpip.DefaultSocketOptionsHandler // Mutex protects the below fields. sync.Mutex `state:"nosave"` @@ -758,9 +746,7 @@ type baseEndpoint struct { // or may be used if the endpoint is connected. path string - // linger is used for SO_LINGER socket option. - linger tcpip.LingerOption - + // ops is used to get socket level options. ops tcpip.SocketOptions } @@ -786,7 +772,7 @@ func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { // Passcred implements Credentialer.Passcred. func (e *baseEndpoint) Passcred() bool { - return atomic.LoadInt32(&e.passcred) != 0 + return e.SocketOptions().GetPassCred() } // ConnectedPasscred implements Credentialer.ConnectedPasscred. @@ -796,14 +782,6 @@ func (e *baseEndpoint) ConnectedPasscred() bool { return e.connected != nil && e.connected.Passcred() } -func (e *baseEndpoint) setPasscred(pc bool) { - if pc { - atomic.StoreInt32(&e.passcred, 1) - } else { - atomic.StoreInt32(&e.passcred, 0) - } -} - // Connected implements ConnectingEndpoint.Connected. func (e *baseEndpoint) Connected() bool { return e.receiver != nil && e.connected != nil @@ -859,23 +837,6 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess // SetSockOpt sets a socket option. func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch v := opt.(type) { - case *tcpip.LingerOption: - e.Lock() - e.linger = *v - e.Unlock() - } - return nil -} - -func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - switch opt { - case tcpip.PasscredOption: - e.setPasscred(v) - case tcpip.ReuseAddressOption: - default: - log.Warningf("Unsupported socket option: %d", opt) - } return nil } @@ -889,20 +850,6 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { return nil } -func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption: - return false, nil - - case tcpip.PasscredOption: - return e.Passcred(), nil - - default: - log.Warningf("Unsupported socket option: %d", opt) - return false, tcpip.ErrUnknownProtocolOption - } -} - func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: @@ -966,17 +913,8 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { - switch o := opt.(type) { - case *tcpip.LingerOption: - e.Lock() - *o = e.linger - e.Unlock() - return nil - - default: - log.Warningf("Unsupported socket option: %T", opt) - return tcpip.ErrUnknownProtocolOption - } + log.Warningf("Unsupported socket option: %T", opt) + return tcpip.ErrUnknownProtocolOption } // LastError implements Endpoint.LastError. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 3e520d2ee..c59297c80 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -115,9 +115,6 @@ type socketOpsCommon struct { // bound, they cannot be modified. abstractName string abstractNamespace *kernel.AbstractSocketNamespace - - // ops is used to get socket level options. - ops tcpip.SocketOptions } func (s *socketOpsCommon) isPacket() bool { @@ -139,7 +136,7 @@ func (s *socketOpsCommon) Endpoint() transport.Endpoint { // extractPath extracts and validates the address. func extractPath(sockaddr []byte) (string, *syserr.Error) { - addr, family, err := netstack.AddressAndFamily(sockaddr) + addr, family, err := socket.AddressAndFamily(sockaddr) if err != nil { if err == syserr.ErrAddressFamilyNotSupported { err = syserr.ErrInvalidArgument @@ -172,7 +169,7 @@ func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, * return nil, 0, syserr.TranslateNetstackError(err) } - a, l := netstack.ConvertAddress(linux.AF_UNIX, addr) + a, l := socket.ConvertAddress(linux.AF_UNIX, addr) return a, l, nil } @@ -184,7 +181,7 @@ func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, * return nil, 0, syserr.TranslateNetstackError(err) } - a, l := netstack.ConvertAddress(linux.AF_UNIX, addr) + a, l := socket.ConvertAddress(linux.AF_UNIX, addr) return a, l, nil } @@ -258,7 +255,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { - addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr) + addr, addrLen = socket.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ @@ -650,7 +647,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags var from linux.SockAddr var fromLen uint32 if r.From != nil && len([]byte(r.From.Addr)) != 0 { - from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From) + from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From) } if r.ControlTrunc { @@ -685,7 +682,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags var from linux.SockAddr var fromLen uint32 if r.From != nil { - from, fromLen = netstack.ConvertAddress(linux.AF_UNIX, *r.From) + from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From) } if r.ControlTrunc { diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index eaf0b0d26..27f705bb2 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -172,7 +172,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { - addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr) + addr, addrLen = socket.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index a920180d3..d36a64ffc 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -32,8 +32,8 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", + "//pkg/sentry/socket", "//pkg/sentry/socket/netlink", - "//pkg/sentry/socket/netstack", "//pkg/sentry/syscalls/linux", "//pkg/usermem", ], diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index cc5f70cd4..d943a7cb1 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" - "gvisor.dev/gvisor/pkg/sentry/socket/netstack" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/usermem" ) @@ -341,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string { switch family { case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: - fa, _, err := netstack.AddressAndFamily(b) + fa, _, err := socket.AddressAndFamily(b) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index bb1f715e2..b815e498f 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -118,7 +118,7 @@ var AMD64 = &kernel.SyscallTable{ 63: syscalls.Supported("uname", Uname), 64: syscalls.Supported("semget", Semget), 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), - 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options SEM_STAT_ANY not supported.", nil), 67: syscalls.Supported("shmdt", Shmdt), 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) @@ -619,7 +619,7 @@ var ARM64 = &kernel.SyscallTable{ 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 190: syscalls.Supported("semget", Semget), - 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil), + 191: syscalls.PartiallySupported("semctl", Semctl, "Options SEM_STAT_ANY not supported.", nil), 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 0bf313a13..c2285f796 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -307,9 +307,8 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if !ok { return syserror.EINVAL } - if ready := ctx.Prepare(); !ready { - // Context is busy. - return syserror.EAGAIN + if err := ctx.Prepare(); err != nil { + return err } if eventFile != nil { diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 519066a47..8db587401 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -646,7 +646,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } - fSetOwn(t, file, set) + fSetOwn(t, int(fd), file, set) return 0, nil, nil case linux.FIOGETOWN, linux.SIOCGPGRP: @@ -901,8 +901,8 @@ func fGetOwn(t *kernel.Task, file *fs.File) int32 { // // If who is positive, it represents a PID. If negative, it represents a PGID. // If the PID or PGID is invalid, the owner is silently unset. -func fSetOwn(t *kernel.Task, file *fs.File, who int32) error { - a := file.Async(fasync.New).(*fasync.FileAsync) +func fSetOwn(t *kernel.Task, fd int, file *fs.File, who int32) error { + a := file.Async(fasync.New(fd)).(*fasync.FileAsync) if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { @@ -1049,7 +1049,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_GETOWN: return uintptr(fGetOwn(t, file)), nil, nil case linux.F_SETOWN: - return 0, nil, fSetOwn(t, file, args[2].Int()) + return 0, nil, fSetOwn(t, int(fd), file, args[2].Int()) case linux.F_GETOWN_EX: addr := args[2].Pointer() owner := fGetOwnEx(t, file) @@ -1062,7 +1062,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - a := file.Async(fasync.New).(*fasync.FileAsync) + a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync) switch owner.Type { case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID)) @@ -1111,6 +1111,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } n, err := sz.SetFifoSize(int64(args[2].Int())) return uintptr(n), nil, err + case linux.F_GETSIG: + a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync) + return uintptr(a.Signal()), nil, nil + case linux.F_SETSIG: + a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync) + return 0, nil, a.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index e383a0a87..1166cd7bb 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -146,11 +146,37 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal v, err := getNCnt(t, id, num) return uintptr(v), nil, err - case linux.IPC_INFO, - linux.SEM_INFO, - linux.SEM_STAT, - linux.SEM_STAT_ANY: + case linux.IPC_INFO: + buf := args[3].Pointer() + r := t.IPCNamespace().SemaphoreRegistry() + info := r.IPCInfo() + if _, err := info.CopyOut(t, buf); err != nil { + return 0, nil, err + } + return uintptr(r.HighestIndex()), nil, nil + + case linux.SEM_INFO: + buf := args[3].Pointer() + r := t.IPCNamespace().SemaphoreRegistry() + info := r.SemInfo() + if _, err := info.CopyOut(t, buf); err != nil { + return 0, nil, err + } + return uintptr(r.HighestIndex()), nil, nil + + case linux.SEM_STAT: + arg := args[3].Pointer() + // id is an index in SEM_STAT. + semid, ds, err := semStat(t, id) + if err != nil { + return 0, nil, err + } + if _, err := ds.CopyOut(t, arg); err != nil { + return 0, nil, err + } + return uintptr(semid), nil, err + case linux.SEM_STAT_ANY: t.Kernel().EmitUnimplementedEvent(t) fallthrough @@ -195,6 +221,17 @@ func ipcStat(t *kernel.Task, id int32) (*linux.SemidDS, error) { return set.GetStat(creds) } +func semStat(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByIndex(index) + if set == nil { + return 0, nil, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + ds, err := set.GetStat(creds) + return set.ID, ds, err +} + func setVal(t *kernel.Task, id int32, num int32, val int16) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index e748d33d8..d639c9bf7 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -88,8 +88,8 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC Signo: int32(sig), Code: arch.SignalInfoUser, } - info.SetPid(int32(target.PIDNamespace().IDOfTask(t))) - info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) + info.SetPID(int32(target.PIDNamespace().IDOfTask(t))) + info.SetUID(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) if err := target.SendGroupSignal(info); err != syserror.ESRCH { return 0, nil, err } @@ -127,8 +127,8 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC Signo: int32(sig), Code: arch.SignalInfoUser, } - info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) - info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) err := tg.SendSignal(info) if err == syserror.ESRCH { // ESRCH is ignored because it means the task @@ -171,8 +171,8 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC Signo: int32(sig), Code: arch.SignalInfoUser, } - info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) - info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) // See note above regarding ESRCH race above. if err := tg.SendSignal(info); err != syserror.ESRCH { lastErr = err @@ -189,8 +189,8 @@ func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalI Signo: int32(sig), Code: arch.SignalInfoTkill, } - info.SetPid(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) - info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + info.SetPID(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) + info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) return info } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 983f8d396..8e7ac0ffe 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -413,8 +413,8 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal si := arch.SignalInfo{ Signo: int32(linux.SIGCHLD), } - si.SetPid(int32(wr.TID)) - si.SetUid(int32(wr.UID)) + si.SetPID(int32(wr.TID)) + si.SetUID(int32(wr.UID)) // TODO(b/73541790): convert kernel.ExitStatus to functions and make // WaitResult.Status a linux.WaitStatus. s := syscall.WaitStatus(wr.Status) diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index 6d0a38330..1365a5a62 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -130,9 +130,8 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if !ok { return syserror.EINVAL } - if ready := aioCtx.Prepare(); !ready { - // Context is busy. - return syserror.EAGAIN + if err := aioCtx.Prepare(); err != nil { + return err } if eventFD != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 36e89700e..7dd9ef857 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -165,7 +165,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall ownerType = linux.F_OWNER_PGRP who = -who } - return 0, nil, setAsyncOwner(t, file, ownerType, who) + return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) case linux.F_GETOWN_EX: owner, hasOwner := getAsyncOwner(t, file) if !hasOwner { @@ -179,7 +179,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID) + return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID) case linux.F_SETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { @@ -207,6 +207,16 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err case linux.F_SETLK, linux.F_SETLKW: return 0, nil, posixLock(t, args, file, cmd) + case linux.F_GETSIG: + a := file.AsyncHandler() + if a == nil { + // Default behavior aka SIGIO. + return 0, nil, nil + } + return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil + case linux.F_SETSIG: + a := file.SetAsyncHandler(fasync.NewVFS2(int(fd))).(*fasync.FileAsync) + return 0, nil, a.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. return 0, nil, syserror.EINVAL @@ -241,7 +251,7 @@ func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwne } } -func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error { +func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error { switch ownerType { case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: // Acceptable type. @@ -249,7 +259,7 @@ func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32 return syserror.EINVAL } - a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) + a := file.SetAsyncHandler(fasync.NewVFS2(fd)).(*fasync.FileAsync) if pid == 0 { a.ClearOwner() return nil diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 2806c3f6f..20c264fef 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -100,7 +100,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall ownerType = linux.F_OWNER_PGRP who = -who } - return 0, nil, setAsyncOwner(t, file, ownerType, who) + return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) } ret, err := file.Ioctl(t, t.MemoryManager(), args) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 440c9307c..a3868bf16 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -105,6 +105,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/fs/lock", + "//pkg/sentry/fsmetric", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index a98aac52b..072655fe8 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -204,8 +204,8 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin file.EventRegister(&epi.waiter, wmask) // Check if the file is already ready. - if file.Readiness(wmask)&wmask != 0 { - epi.Callback(nil) + if m := file.Readiness(wmask) & wmask; m != 0 { + epi.Callback(nil, m) } // Add epi to file.epolls so that it is removed when the last @@ -274,8 +274,8 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event file.EventRegister(&epi.waiter, wmask) // Check if the file is already ready with the new mask. - if file.Readiness(wmask)&wmask != 0 { - epi.Callback(nil) + if m := file.Readiness(wmask) & wmask; m != 0 { + epi.Callback(nil, m) } return nil @@ -311,7 +311,7 @@ func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error } // Callback implements waiter.EntryCallback.Callback. -func (epi *epollInterest) Callback(*waiter.Entry) { +func (epi *epollInterest) Callback(*waiter.Entry, waiter.EventMask) { newReady := false epi.epoll.mu.Lock() if !epi.ready { diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 936f9fc71..5321ac80a 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -15,12 +15,14 @@ package vfs import ( + "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" @@ -42,7 +44,7 @@ import ( type FileDescription struct { FileDescriptionRefs - // flagsMu protects statusFlags and asyncHandler below. + // flagsMu protects `statusFlags`, `saved`, and `asyncHandler` below. flagsMu sync.Mutex `state:"nosave"` // statusFlags contains status flags, "initialized by open(2) and possibly @@ -51,6 +53,11 @@ type FileDescription struct { // access to asyncHandler. statusFlags uint32 + // saved is true after beforeSave is called. This is used to prevent + // double-unregistration of asyncHandler. This does not work properly for + // save-resume, which is not currently supported in gVisor (see b/26588733). + saved bool `state:"nosave"` + // asyncHandler handles O_ASYNC signal generation. It is set with the // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must // also be set by fcntl(2). @@ -183,7 +190,7 @@ func (fd *FileDescription) DecRef(ctx context.Context) { } fd.vd.DecRef(ctx) fd.flagsMu.Lock() - if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { + if !fd.saved && fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { fd.asyncHandler.Unregister(fd) } fd.asyncHandler = nil @@ -583,7 +590,11 @@ func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of if !fd.readable { return 0, syserror.EBADF } - return fd.impl.PRead(ctx, dst, offset, opts) + start := fsmetric.StartReadWait() + n, err := fd.impl.PRead(ctx, dst, offset, opts) + fsmetric.Reads.Increment() + fsmetric.FinishReadWait(fsmetric.ReadWait, start) + return n, err } // Read is similar to PRead, but does not specify an offset. @@ -591,7 +602,11 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt if !fd.readable { return 0, syserror.EBADF } - return fd.impl.Read(ctx, dst, opts) + start := fsmetric.StartReadWait() + n, err := fd.impl.Read(ctx, dst, opts) + fsmetric.Reads.Increment() + fsmetric.FinishReadWait(fsmetric.ReadWait, start) + return n, err } // PWrite writes src to the file represented by fd, starting at the given @@ -825,44 +840,27 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn return fd.asyncHandler } -// FileReadWriteSeeker is a helper struct to pass a FileDescription as -// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc. -type FileReadWriteSeeker struct { - FD *FileDescription - Ctx context.Context - ROpts ReadOptions - WOpts WriteOptions -} - -// ReadAt implements io.ReaderAt.ReadAt. -func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) { - dst := usermem.BytesIOSequence(p) - n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts) - return int(n), err -} - -// Read implements io.ReadWriteSeeker.Read. -func (f *FileReadWriteSeeker) Read(p []byte) (int, error) { - dst := usermem.BytesIOSequence(p) - n, err := f.FD.Read(f.Ctx, dst, f.ROpts) - return int(n), err -} - -// Seek implements io.ReadWriteSeeker.Seek. -func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) { - return f.FD.Seek(f.Ctx, offset, int32(whence)) -} - -// WriteAt implements io.WriterAt.WriteAt. -func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) { - dst := usermem.BytesIOSequence(p) - n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts) - return int(n), err -} - -// Write implements io.ReadWriteSeeker.Write. -func (f *FileReadWriteSeeker) Write(p []byte) (int, error) { - buf := usermem.BytesIOSequence(p) - n, err := f.FD.Write(f.Ctx, buf, f.WOpts) - return int(n), err +// CopyRegularFileData copies data from srcFD to dstFD until reading from srcFD +// returns EOF or an error. It returns the number of bytes copied. +func CopyRegularFileData(ctx context.Context, dstFD, srcFD *FileDescription) (int64, error) { + done := int64(0) + buf := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size + for { + readN, readErr := srcFD.Read(ctx, buf, ReadOptions{}) + if readErr != nil && readErr != io.EOF { + return done, readErr + } + src := buf.TakeFirst64(readN) + for src.NumBytes() != 0 { + writeN, writeErr := dstFD.Write(ctx, src, WriteOptions{}) + done += writeN + src = src.DropFirst64(writeN) + if writeErr != nil { + return done, writeErr + } + } + if readErr == io.EOF { + return done, nil + } + } } diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index cb48c37a1..0df023713 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -12,11 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.12 -// +build !go1.17 - -// Check go:linkname function signatures when updating Go version. - package vfs import ( @@ -41,6 +36,15 @@ type mountKey struct { point unsafe.Pointer // *Dentry } +var ( + mountKeyHasher = sync.MapKeyHasher(map[mountKey]struct{}(nil)) + mountKeySeed = sync.RandUintptr() +) + +func (k *mountKey) hash() uintptr { + return mountKeyHasher(gohacks.Noescape(unsafe.Pointer(k)), mountKeySeed) +} + func (mnt *Mount) parent() *Mount { return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) } @@ -56,23 +60,17 @@ func (mnt *Mount) getKey() VirtualDentry { } } -func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() } - // Invariant: mnt.key.parent == nil. vd.Ok(). func (mnt *Mount) setKey(vd VirtualDentry) { atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) } -func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) } - // mountTable maps (mount parent, mount point) pairs to mounts. It supports // efficient concurrent lookup, even in the presence of concurrent mutators // (provided mutation is sufficiently uncommon). // // mountTable.Init() must be called on new mountTables before use. -// -// +stateify savable type mountTable struct { // mountTable is implemented as a seqcount-protected hash table that // resolves collisions with linear probing, featuring Robin Hood insertion @@ -84,8 +82,7 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) - seq sync.SeqCount `state:"nosave"` - seed uint32 // for hashing keys + seq sync.SeqCount `state:"nosave"` // size holds both length (number of elements) and capacity (number of // slots): capacity is stored as its base-2 log (referred to as order) in @@ -150,7 +147,6 @@ func init() { // Init must be called exactly once on each mountTable before use. func (mt *mountTable) Init() { - mt.seed = rand32() mt.size = mtInitOrder mt.slots = newMountTableSlots(mtInitCap) } @@ -167,7 +163,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer { // Lookup may be called even if there are concurrent mutators of mt. func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} - hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + hash := key.hash() loop: for { @@ -247,7 +243,7 @@ func (mt *mountTable) Insert(mount *Mount) { // * mt.seq must be in a writer critical section. // * mt must not already contain a Mount with the same mount point and parent. func (mt *mountTable) insertSeqed(mount *Mount) { - hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + hash := mount.key.hash() // We're under the maximum load factor if: // @@ -346,7 +342,7 @@ func (mt *mountTable) Remove(mount *Mount) { // * mt.seq must be in a writer critical section. // * mt must contain mount. func (mt *mountTable) removeSeqed(mount *Mount) { - hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + hash := mount.key.hash() tcap := uintptr(1) << (mt.size & mtSizeOrderMask) mask := tcap - 1 slots := mt.slots @@ -386,9 +382,3 @@ func (mt *mountTable) removeSeqed(mount *Mount) { off = (off + mountSlotBytes) & offmask } } - -//go:linkname memhash runtime.memhash -func memhash(p unsafe.Pointer, seed, s uintptr) uintptr - -//go:linkname rand32 runtime.fastrand -func rand32() uint32 diff --git a/pkg/sentry/vfs/save_restore.go b/pkg/sentry/vfs/save_restore.go index 7723ed643..8998a82dd 100644 --- a/pkg/sentry/vfs/save_restore.go +++ b/pkg/sentry/vfs/save_restore.go @@ -18,8 +18,10 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refsvfs2" + "gvisor.dev/gvisor/pkg/waiter" ) // FilesystemImplSaveRestoreExtension is an optional extension to @@ -99,6 +101,9 @@ func (vfs *VirtualFilesystem) saveMounts() []*Mount { return mounts } +// saveKey is called by stateify. +func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() } + // loadMounts is called by stateify. func (vfs *VirtualFilesystem) loadMounts(mounts []*Mount) { if mounts == nil { @@ -110,6 +115,9 @@ func (vfs *VirtualFilesystem) loadMounts(mounts []*Mount) { } } +// loadKey is called by stateify. +func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) } + func (mnt *Mount) afterLoad() { if atomic.LoadInt64(&mnt.refs) != 0 { refsvfs2.Register(mnt) @@ -120,5 +128,20 @@ func (mnt *Mount) afterLoad() { func (epi *epollInterest) afterLoad() { // Mark all epollInterests as ready after restore so that the next call to // EpollInstance.ReadEvents() rechecks their readiness. - epi.Callback(nil) + epi.Callback(nil, waiter.EventMaskFromLinux(epi.mask)) +} + +// beforeSave is called by stateify. +func (fd *FileDescription) beforeSave() { + fd.saved = true + if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { + fd.asyncHandler.Unregister(fd) + } +} + +// afterLoad is called by stateify. +func (fd *FileDescription) afterLoad() { + if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { + fd.asyncHandler.Register(fd) + } } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 48d6252f7..6fd1bb0b2 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -41,6 +41,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" @@ -381,6 +382,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia // OpenAt returns a FileDescription providing access to the file at the given // path. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + fsmetric.Opens.Increment() + // Remove: // // - O_CLOEXEC, which affects file descriptors and therefore must be diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 1d1062aeb..8e3146d8d 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -338,6 +338,7 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo tid := w.k.TaskSet().Root.IDOfTask(t) buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime))) } + buf.WriteString("Search for 'goroutine <id>' in the stack dump to find the offending goroutine(s)") // Force stack dump only if a new task is detected. w.doAction(w.TaskTimeoutAction, newTaskFound, &buf) diff --git a/pkg/shim/v1/proc/process.go b/pkg/shim/v1/proc/process.go index d462c3eef..e8315326d 100644 --- a/pkg/shim/v1/proc/process.go +++ b/pkg/shim/v1/proc/process.go @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package proc contains process-related utilities. package proc import ( diff --git a/pkg/shim/v1/shim/BUILD b/pkg/shim/v1/shim/BUILD index 05c595bc9..e5b6bf186 100644 --- a/pkg/shim/v1/shim/BUILD +++ b/pkg/shim/v1/shim/BUILD @@ -8,6 +8,7 @@ go_library( "api.go", "platform.go", "service.go", + "shim.go", ], visibility = [ "//pkg/shim:__subpackages__", diff --git a/pkg/sleep/commit_asm.go b/pkg/shim/v1/shim/shim.go index 75728a97d..1855a8769 100644 --- a/pkg/sleep/commit_asm.go +++ b/pkg/shim/v1/shim/shim.go @@ -1,10 +1,11 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2018 The containerd Authors. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -12,9 +13,5 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build amd64 arm64 - -package sleep - -// See commit_noasm.go for a description of commitSleep. -func commitSleep(g uintptr, waitingG *uintptr) bool +// Package shim contains the core containerd shim implementation. +package shim diff --git a/pkg/shim/v1/utils/utils.go b/pkg/shim/v1/utils/utils.go index 07e346654..21e75d16d 100644 --- a/pkg/shim/v1/utils/utils.go +++ b/pkg/shim/v1/utils/utils.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package utils contains utility functions. package utils import ( diff --git a/pkg/shim/v2/BUILD b/pkg/shim/v2/BUILD index f37fefddc..b0e8daa51 100644 --- a/pkg/shim/v2/BUILD +++ b/pkg/shim/v2/BUILD @@ -22,6 +22,7 @@ go_library( "//runsc/specutils", "@com_github_burntsushi_toml//:go_default_library", "@com_github_containerd_cgroups//:go_default_library", + "@com_github_containerd_cgroups//stats/v1:go_default_library", "@com_github_containerd_console//:go_default_library", "@com_github_containerd_containerd//api/events:go_default_library", "@com_github_containerd_containerd//api/types/task:go_default_library", diff --git a/pkg/shim/v2/service.go b/pkg/shim/v2/service.go index 2e39d2c4a..6aaf5fab8 100644 --- a/pkg/shim/v2/service.go +++ b/pkg/shim/v2/service.go @@ -28,6 +28,7 @@ import ( "github.com/BurntSushi/toml" "github.com/containerd/cgroups" + cgroupsstats "github.com/containerd/cgroups/stats/v1" "github.com/containerd/console" "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/api/types/task" @@ -67,9 +68,15 @@ var ( var _ = (taskAPI.TaskService)(&service{}) -// configFile is the default config file name. For containerd 1.2, -// we assume that a config.toml should exist in the runtime root. -const configFile = "config.toml" +const ( + // configFile is the default config file name. For containerd 1.2, + // we assume that a config.toml should exist in the runtime root. + configFile = "config.toml" + + // shimAddressPath is the relative path to a file that contains the address + // to the shim UDS. See service.shimAddress. + shimAddressPath = "address" +) // New returns a new shim service that can be used via GRPC. func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) { @@ -101,6 +108,11 @@ func New(ctx context.Context, id string, publisher shim.Publisher, cancel func() return nil, fmt.Errorf("failed to initialized platform behavior: %w", err) } go s.forward(ctx, publisher) + + if address, err := shim.ReadAddress(shimAddressPath); err == nil { + s.shimAddress = address + } + return s, nil } @@ -152,6 +164,9 @@ type service struct { // cancel is a function that needs to be called before the shim stops. The // function is provided by the caller to New(). cancel func() + + // shimAddress is the location of the UDS used to communicate to containerd. + shimAddress string } func (s *service) newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) { @@ -191,38 +206,58 @@ func (s *service) StartShim(ctx context.Context, id, containerdBinary, container if err != nil { return "", err } - address, err := shim.SocketAddress(ctx, id) + address, err := shim.SocketAddress(ctx, containerdAddress, id) if err != nil { return "", err } socket, err := shim.NewSocket(address) if err != nil { - return "", err + // The only time where this would happen is if there is a bug and the socket + // was not cleaned up in the cleanup method of the shim or we are using the + // grouping functionality where the new process should be run with the same + // shim as an existing container. + if !shim.SocketEaddrinuse(err) { + return "", fmt.Errorf("create new shim socket: %w", err) + } + if shim.CanConnect(address) { + if err := shim.WriteAddress(shimAddressPath, address); err != nil { + return "", fmt.Errorf("write existing socket for shim: %w", err) + } + return address, nil + } + if err := shim.RemoveSocket(address); err != nil { + return "", fmt.Errorf("remove pre-existing socket: %w", err) + } + if socket, err = shim.NewSocket(address); err != nil { + return "", fmt.Errorf("try create new shim socket 2x: %w", err) + } } - defer socket.Close() + cu := cleanup.Make(func() { + socket.Close() + _ = shim.RemoveSocket(address) + }) + defer cu.Clean() + f, err := socket.File() if err != nil { return "", err } - defer f.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, f) log.L.Debugf("Executing: %q %s", cmd.Path, cmd.Args) if err := cmd.Start(); err != nil { + f.Close() return "", err } - cu := cleanup.Make(func() { - cmd.Process.Kill() - }) - defer cu.Clean() + cu.Add(func() { cmd.Process.Kill() }) // make sure to wait after start go cmd.Wait() if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil { return "", err } - if err := shim.WriteAddress("address", address); err != nil { + if err := shim.WriteAddress(shimAddressPath, address); err != nil { return "", err } if err := shim.SetScore(cmd.Process.Pid); err != nil { @@ -675,8 +710,11 @@ func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*task func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) { log.L.Debugf("Shutdown, id: %s", r.ID) s.cancel() + if s.shimAddress != "" { + _ = shim.RemoveSocket(s.shimAddress) + } os.Exit(0) - return empty, nil + panic("Should not get here") } func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) { @@ -698,48 +736,48 @@ func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI. // as runc. // // [0]: https://github.com/google/gvisor/blob/277a0d5a1fbe8272d4729c01ee4c6e374d047ebc/runsc/boot/events.go#L61-L81 - metrics := &cgroups.Metrics{ - CPU: &cgroups.CPUStat{ - Usage: &cgroups.CPUUsage{ + metrics := &cgroupsstats.Metrics{ + CPU: &cgroupsstats.CPUStat{ + Usage: &cgroupsstats.CPUUsage{ Total: stats.Cpu.Usage.Total, Kernel: stats.Cpu.Usage.Kernel, User: stats.Cpu.Usage.User, PerCPU: stats.Cpu.Usage.Percpu, }, - Throttling: &cgroups.Throttle{ + Throttling: &cgroupsstats.Throttle{ Periods: stats.Cpu.Throttling.Periods, ThrottledPeriods: stats.Cpu.Throttling.ThrottledPeriods, ThrottledTime: stats.Cpu.Throttling.ThrottledTime, }, }, - Memory: &cgroups.MemoryStat{ + Memory: &cgroupsstats.MemoryStat{ Cache: stats.Memory.Cache, - Usage: &cgroups.MemoryEntry{ + Usage: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Usage.Limit, Usage: stats.Memory.Usage.Usage, Max: stats.Memory.Usage.Max, Failcnt: stats.Memory.Usage.Failcnt, }, - Swap: &cgroups.MemoryEntry{ + Swap: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Swap.Limit, Usage: stats.Memory.Swap.Usage, Max: stats.Memory.Swap.Max, Failcnt: stats.Memory.Swap.Failcnt, }, - Kernel: &cgroups.MemoryEntry{ + Kernel: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Kernel.Limit, Usage: stats.Memory.Kernel.Usage, Max: stats.Memory.Kernel.Max, Failcnt: stats.Memory.Kernel.Failcnt, }, - KernelTCP: &cgroups.MemoryEntry{ + KernelTCP: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.KernelTCP.Limit, Usage: stats.Memory.KernelTCP.Usage, Max: stats.Memory.KernelTCP.Max, Failcnt: stats.Memory.KernelTCP.Failcnt, }, }, - Pids: &cgroups.PidsStat{ + Pids: &cgroupsstats.PidsStat{ Current: stats.Pids.Current, Limit: stats.Pids.Limit, }, @@ -843,9 +881,7 @@ func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, er func (s *service) forward(ctx context.Context, publisher shim.Publisher) { for e := range s.events { - ctx, cancel := context.WithTimeout(ctx, 5*time.Second) err := publisher.Publish(ctx, getTopic(e), e) - cancel() if err != nil { // Should not happen. panic(fmt.Errorf("post event: %w", err)) diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index ae0fe1522..48bcdd62b 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -5,10 +5,6 @@ package(licenses = ["notice"]) go_library( name = "sleep", srcs = [ - "commit_amd64.s", - "commit_arm64.s", - "commit_asm.go", - "commit_noasm.go", "sleep_unsafe.go", ], visibility = ["//:sandbox"], diff --git a/pkg/sleep/commit_amd64.s b/pkg/sleep/commit_amd64.s deleted file mode 100644 index bc4ac2c3c..000000000 --- a/pkg/sleep/commit_amd64.s +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -#define preparingG 1 - -// See commit_noasm.go for a description of commitSleep. -// -// func commitSleep(g uintptr, waitingG *uintptr) bool -TEXT ·commitSleep(SB),NOSPLIT,$0-24 - MOVQ waitingG+8(FP), CX - MOVQ g+0(FP), DX - - // Store the G in waitingG if it's still preparingG. If it's anything - // else it means a waker has aborted the sleep. - MOVQ $preparingG, AX - LOCK - CMPXCHGQ DX, 0(CX) - - SETEQ AX - MOVB AX, ret+16(FP) - - RET diff --git a/pkg/sleep/commit_arm64.s b/pkg/sleep/commit_arm64.s deleted file mode 100644 index d0ef15b20..000000000 --- a/pkg/sleep/commit_arm64.s +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -#define preparingG 1 - -// See commit_noasm.go for a description of commitSleep. -// -// func commitSleep(g uintptr, waitingG *uintptr) bool -TEXT ·commitSleep(SB),NOSPLIT,$0-24 - MOVD waitingG+8(FP), R0 - MOVD $preparingG, R1 - MOVD G+0(FP), R2 - - // Store the G in waitingG if it's still preparingG. If it's anything - // else it means a waker has aborted the sleep. -again: - LDAXR (R0), R3 - CMP R1, R3 - BNE ok - STLXR R2, (R0), R3 - CBNZ R3, again -ok: - CSET EQ, R0 - MOVB R0, ret+16(FP) - RET diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go deleted file mode 100644 index f59061f37..000000000 --- a/pkg/sleep/commit_noasm.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build !race -// +build !amd64,!arm64 - -package sleep - -import "sync/atomic" - -// commitSleep signals to wakers that the given g is now sleeping. Wakers can -// then fetch it and wake it. -// -// The commit may fail if wakers have been asserted after our last check, in -// which case they will have set s.waitingG to zero. -// -// It is written in assembly because it is called from g0, so it doesn't have -// a race context. -func commitSleep(g uintptr, waitingG *uintptr) bool { - // Try to store the G so that wakers know who to wake. - return atomic.CompareAndSwapUintptr(waitingG, preparingG, g) -} diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go index 19bce2afb..c44206b1e 100644 --- a/pkg/sleep/sleep_unsafe.go +++ b/pkg/sleep/sleep_unsafe.go @@ -12,11 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.11 -// +build !go1.17 - -// Check go:linkname function signatures when updating Go version. - // Package sleep allows goroutines to efficiently sleep on multiple sources of // notifications (wakers). It offers O(1) complexity, which is different from // multi-channel selects which have O(n) complexity (where n is the number of @@ -91,12 +86,6 @@ var ( assertedSleeper Sleeper ) -//go:linkname gopark runtime.gopark -func gopark(unlockf func(uintptr, *uintptr) bool, wg *uintptr, reason uint8, traceEv byte, traceskip int) - -//go:linkname goready runtime.goready -func goready(g uintptr, traceskip int) - // Sleeper allows a goroutine to sleep and receive wake up notifications from // Wakers in an efficient way. // @@ -189,7 +178,7 @@ func (s *Sleeper) nextWaker(block bool) *Waker { // See:runtime2.go in the go runtime package for // the values to pass as the waitReason here. const waitReasonSelect = 9 - gopark(commitSleep, &s.waitingG, waitReasonSelect, traceEvGoBlockSelect, 0) + sync.Gopark(commitSleep, unsafe.Pointer(&s.waitingG), sync.WaitReasonSelect, sync.TraceEvGoBlockSelect, 0) } // Pull the shared list out and reverse it in the local @@ -212,6 +201,18 @@ func (s *Sleeper) nextWaker(block bool) *Waker { return w } +// commitSleep signals to wakers that the given g is now sleeping. Wakers can +// then fetch it and wake it. +// +// The commit may fail if wakers have been asserted after our last check, in +// which case they will have set s.waitingG to zero. +// +//go:norace +//go:nosplit +func commitSleep(g uintptr, waitingG unsafe.Pointer) bool { + return sync.RaceUncheckedAtomicCompareAndSwapUintptr((*uintptr)(waitingG), preparingG, g) +} + // Fetch fetches the next wake-up notification. If a notification is immediately // available, it is returned right away. Otherwise, the behavior depends on the // value of 'block': if true, the current goroutine blocks until a notification @@ -311,7 +312,7 @@ func (s *Sleeper) enqueueAssertedWaker(w *Waker) { case 0, preparingG: default: // We managed to get a G. Wake it up. - goready(g, 0) + sync.Goready(g, 0) } } diff --git a/pkg/state/tests/integer_test.go b/pkg/state/tests/integer_test.go index d3931c952..2b1609af0 100644 --- a/pkg/state/tests/integer_test.go +++ b/pkg/state/tests/integer_test.go @@ -20,21 +20,21 @@ import ( ) var ( - allIntTs = []int{-1, 0, 1} - allInt8s = []int8{math.MinInt8, -1, 0, 1, math.MaxInt8} - allInt16s = []int16{math.MinInt16, -1, 0, 1, math.MaxInt16} - allInt32s = []int32{math.MinInt32, -1, 0, 1, math.MaxInt32} - allInt64s = []int64{math.MinInt64, -1, 0, 1, math.MaxInt64} - allUintTs = []uint{0, 1} - allUintptrs = []uintptr{0, 1, ^uintptr(0)} - allUint8s = []uint8{0, 1, math.MaxUint8} - allUint16s = []uint16{0, 1, math.MaxUint16} - allUint32s = []uint32{0, 1, math.MaxUint32} - allUint64s = []uint64{0, 1, math.MaxUint64} + allBasicInts = []int{-1, 0, 1} + allInt8s = []int8{math.MinInt8, -1, 0, 1, math.MaxInt8} + allInt16s = []int16{math.MinInt16, -1, 0, 1, math.MaxInt16} + allInt32s = []int32{math.MinInt32, -1, 0, 1, math.MaxInt32} + allInt64s = []int64{math.MinInt64, -1, 0, 1, math.MaxInt64} + allBasicUints = []uint{0, 1} + allUintptrs = []uintptr{0, 1, ^uintptr(0)} + allUint8s = []uint8{0, 1, math.MaxUint8} + allUint16s = []uint16{0, 1, math.MaxUint16} + allUint32s = []uint32{0, 1, math.MaxUint32} + allUint64s = []uint64{0, 1, math.MaxUint64} ) var allInts = flatten( - allIntTs, + allBasicInts, allInt8s, allInt16s, allInt32s, @@ -42,7 +42,7 @@ var allInts = flatten( ) var allUints = flatten( - allUintTs, + allBasicUints, allUintptrs, allUint8s, allUint16s, diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 68535c3b1..28e62abbb 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -10,15 +10,34 @@ exports_files(["LICENSE"]) go_template( name = "generic_atomicptr", - srcs = ["atomicptr_unsafe.go"], + srcs = ["generic_atomicptr_unsafe.go"], types = [ "Value", ], ) go_template( + name = "generic_atomicptrmap", + srcs = ["generic_atomicptrmap_unsafe.go"], + opt_consts = [ + "ShardOrder", + ], + opt_types = [ + "Hasher", + ], + types = [ + "Key", + "Value", + ], + deps = [ + ":sync", + "//pkg/gohacks", + ], +) + +go_template( name = "generic_seqatomic", - srcs = ["seqatomic_unsafe.go"], + srcs = ["generic_seqatomic_unsafe.go"], types = [ "Value", ], @@ -31,18 +50,26 @@ go_library( name = "sync", srcs = [ "aliases.go", - "memmove_unsafe.go", + "checklocks_off_unsafe.go", + "checklocks_on_unsafe.go", + "goyield_go113_unsafe.go", + "goyield_unsafe.go", "mutex_unsafe.go", "nocopy.go", "norace_unsafe.go", + "race_amd64.s", + "race_arm64.s", "race_unsafe.go", + "runtime_unsafe.go", "rwmutex_unsafe.go", "seqcount.go", - "spin_unsafe.go", "sync.go", ], marshal = False, stateify = False, + deps = [ + "//pkg/goid", + ], ) go_test( diff --git a/pkg/sync/atomicptrmaptest/BUILD b/pkg/sync/atomicptrmaptest/BUILD new file mode 100644 index 000000000..3f71ae97d --- /dev/null +++ b/pkg/sync/atomicptrmaptest/BUILD @@ -0,0 +1,57 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package( + default_visibility = ["//visibility:private"], + licenses = ["notice"], +) + +go_template_instance( + name = "test_atomicptrmap", + out = "test_atomicptrmap_unsafe.go", + package = "atomicptrmap", + prefix = "test", + template = "//pkg/sync:generic_atomicptrmap", + types = { + "Key": "int64", + "Value": "testValue", + }, +) + +go_template_instance( + name = "test_atomicptrmap_sharded", + out = "test_atomicptrmap_sharded_unsafe.go", + consts = { + "ShardOrder": "4", + }, + package = "atomicptrmap", + prefix = "test", + suffix = "Sharded", + template = "//pkg/sync:generic_atomicptrmap", + types = { + "Key": "int64", + "Value": "testValue", + }, +) + +go_library( + name = "atomicptrmap", + testonly = 1, + srcs = [ + "atomicptrmap.go", + "test_atomicptrmap_sharded_unsafe.go", + "test_atomicptrmap_unsafe.go", + ], + deps = [ + "//pkg/gohacks", + "//pkg/sync", + ], +) + +go_test( + name = "atomicptrmap_test", + size = "small", + srcs = ["atomicptrmap_test.go"], + library = ":atomicptrmap", + deps = ["//pkg/sync"], +) diff --git a/tools/vm/test.cc b/pkg/sync/atomicptrmaptest/atomicptrmap.go index c0ceacda1..867821ce9 100644 --- a/tools/vm/test.cc +++ b/pkg/sync/atomicptrmaptest/atomicptrmap.go @@ -12,16 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "gtest/gtest.h" +// Package atomicptrmap instantiates generic_atomicptrmap for testing. +package atomicptrmap -namespace { - -TEST(Image, Sanity0) { - // Do nothing (in shard 0). -} - -TEST(Image, Sanity1) { - // Do nothing (in shard 1). +type testValue struct { + val int } - -} // namespace diff --git a/pkg/sync/atomicptrmaptest/atomicptrmap_test.go b/pkg/sync/atomicptrmaptest/atomicptrmap_test.go new file mode 100644 index 000000000..75a9997ef --- /dev/null +++ b/pkg/sync/atomicptrmaptest/atomicptrmap_test.go @@ -0,0 +1,635 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package atomicptrmap + +import ( + "context" + "fmt" + "math/rand" + "reflect" + "runtime" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/sync" +) + +func TestConsistencyWithGoMap(t *testing.T) { + const maxKey = 16 + var vals [4]*testValue + for i := 1; /* leave vals[0] nil */ i < len(vals); i++ { + vals[i] = new(testValue) + } + var ( + m = make(map[int64]*testValue) + apm testAtomicPtrMap + ) + for i := 0; i < 100000; i++ { + // Apply a random operation to both m and apm and expect them to have + // the same result. Bias toward CompareAndSwap, which has the most + // cases; bias away from Range and RangeRepeatable, which are + // relatively expensive. + switch rand.Intn(10) { + case 0, 1: // Load + key := rand.Int63n(maxKey) + want := m[key] + got := apm.Load(key) + t.Logf("Load(%d) = %p", key, got) + if got != want { + t.Fatalf("got %p, wanted %p", got, want) + } + case 2, 3: // Swap + key := rand.Int63n(maxKey) + val := vals[rand.Intn(len(vals))] + want := m[key] + if val != nil { + m[key] = val + } else { + delete(m, key) + } + got := apm.Swap(key, val) + t.Logf("Swap(%d, %p) = %p", key, val, got) + if got != want { + t.Fatalf("got %p, wanted %p", got, want) + } + case 4, 5, 6, 7: // CompareAndSwap + key := rand.Int63n(maxKey) + oldVal := vals[rand.Intn(len(vals))] + newVal := vals[rand.Intn(len(vals))] + want := m[key] + if want == oldVal { + if newVal != nil { + m[key] = newVal + } else { + delete(m, key) + } + } + got := apm.CompareAndSwap(key, oldVal, newVal) + t.Logf("CompareAndSwap(%d, %p, %p) = %p", key, oldVal, newVal, got) + if got != want { + t.Fatalf("got %p, wanted %p", got, want) + } + case 8: // Range + got := make(map[int64]*testValue) + var ( + haveDup = false + dup int64 + ) + apm.Range(func(key int64, val *testValue) bool { + if _, ok := got[key]; ok && !haveDup { + haveDup = true + dup = key + } + got[key] = val + return true + }) + t.Logf("Range() = %v", got) + if !reflect.DeepEqual(got, m) { + t.Fatalf("got %v, wanted %v", got, m) + } + if haveDup { + t.Fatalf("got duplicate key %d", dup) + } + case 9: // RangeRepeatable + got := make(map[int64]*testValue) + apm.RangeRepeatable(func(key int64, val *testValue) bool { + got[key] = val + return true + }) + t.Logf("RangeRepeatable() = %v", got) + if !reflect.DeepEqual(got, m) { + t.Fatalf("got %v, wanted %v", got, m) + } + } + } +} + +func TestConcurrentHeterogeneous(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + var ( + apm testAtomicPtrMap + wg sync.WaitGroup + ) + defer func() { + cancel() + wg.Wait() + }() + + possibleKeyValuePairs := make(map[int64]map[*testValue]struct{}) + addKeyValuePair := func(key int64, val *testValue) { + values := possibleKeyValuePairs[key] + if values == nil { + values = make(map[*testValue]struct{}) + possibleKeyValuePairs[key] = values + } + values[val] = struct{}{} + } + + const numValuesPerKey = 4 + + // These goroutines use keys not used by any other goroutine. + const numPrivateKeys = 3 + for i := 0; i < numPrivateKeys; i++ { + key := int64(i) + var vals [numValuesPerKey]*testValue + for i := 1; /* leave vals[0] nil */ i < len(vals); i++ { + val := new(testValue) + vals[i] = val + addKeyValuePair(key, val) + } + wg.Add(1) + go func() { + defer wg.Done() + r := rand.New(rand.NewSource(rand.Int63())) + var stored *testValue + for ctx.Err() == nil { + switch r.Intn(4) { + case 0: + got := apm.Load(key) + if got != stored { + t.Errorf("Load(%d): got %p, wanted %p", key, got, stored) + return + } + case 1: + val := vals[r.Intn(len(vals))] + want := stored + stored = val + got := apm.Swap(key, val) + if got != want { + t.Errorf("Swap(%d, %p): got %p, wanted %p", key, val, got, want) + return + } + case 2, 3: + oldVal := vals[r.Intn(len(vals))] + newVal := vals[r.Intn(len(vals))] + want := stored + if stored == oldVal { + stored = newVal + } + got := apm.CompareAndSwap(key, oldVal, newVal) + if got != want { + t.Errorf("CompareAndSwap(%d, %p, %p): got %p, wanted %p", key, oldVal, newVal, got, want) + return + } + } + } + }() + } + + // These goroutines share a small set of keys. + const numSharedKeys = 2 + var ( + sharedKeys [numSharedKeys]int64 + sharedValues = make(map[int64][]*testValue) + sharedValuesSet = make(map[int64]map[*testValue]struct{}) + ) + for i := range sharedKeys { + key := int64(numPrivateKeys + i) + sharedKeys[i] = key + vals := make([]*testValue, numValuesPerKey) + valsSet := make(map[*testValue]struct{}) + for j := range vals { + val := new(testValue) + vals[j] = val + valsSet[val] = struct{}{} + addKeyValuePair(key, val) + } + sharedValues[key] = vals + sharedValuesSet[key] = valsSet + } + randSharedValue := func(r *rand.Rand, key int64) *testValue { + vals := sharedValues[key] + return vals[r.Intn(len(vals))] + } + for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + defer wg.Done() + r := rand.New(rand.NewSource(rand.Int63())) + for ctx.Err() == nil { + keyIndex := r.Intn(len(sharedKeys)) + key := sharedKeys[keyIndex] + var ( + op string + got *testValue + ) + switch r.Intn(4) { + case 0: + op = "Load" + got = apm.Load(key) + case 1: + op = "Swap" + got = apm.Swap(key, randSharedValue(r, key)) + case 2, 3: + op = "CompareAndSwap" + got = apm.CompareAndSwap(key, randSharedValue(r, key), randSharedValue(r, key)) + } + if got != nil { + valsSet := sharedValuesSet[key] + if _, ok := valsSet[got]; !ok { + t.Errorf("%s: got key %d, value %p; expected value in %v", op, key, got, valsSet) + return + } + } + } + }() + } + + // This goroutine repeatedly searches for unused keys. + wg.Add(1) + go func() { + defer wg.Done() + r := rand.New(rand.NewSource(rand.Int63())) + for ctx.Err() == nil { + key := -1 - r.Int63() + if got := apm.Load(key); got != nil { + t.Errorf("Load(%d): got %p, wanted nil", key, got) + } + } + }() + + // This goroutine repeatedly calls RangeRepeatable() and checks that each + // key corresponds to an expected value. + wg.Add(1) + go func() { + defer wg.Done() + abort := false + for !abort && ctx.Err() == nil { + apm.RangeRepeatable(func(key int64, val *testValue) bool { + values, ok := possibleKeyValuePairs[key] + if !ok { + t.Errorf("RangeRepeatable: got invalid key %d", key) + abort = true + return false + } + if _, ok := values[val]; !ok { + t.Errorf("RangeRepeatable: got key %d, value %p; expected one of %v", key, val, values) + abort = true + return false + } + return true + }) + } + }() + + // Finally, the main goroutine spins for the length of the test calling + // Range() and checking that each key that it observes is unique and + // corresponds to an expected value. + seenKeys := make(map[int64]struct{}) + const testDuration = 5 * time.Second + end := time.Now().Add(testDuration) + abort := false + for time.Now().Before(end) { + apm.Range(func(key int64, val *testValue) bool { + values, ok := possibleKeyValuePairs[key] + if !ok { + t.Errorf("Range: got invalid key %d", key) + abort = true + return false + } + if _, ok := values[val]; !ok { + t.Errorf("Range: got key %d, value %p; expected one of %v", key, val, values) + abort = true + return false + } + if _, ok := seenKeys[key]; ok { + t.Errorf("Range: got duplicate key %d", key) + abort = true + return false + } + seenKeys[key] = struct{}{} + return true + }) + if abort { + break + } + for k := range seenKeys { + delete(seenKeys, k) + } + } +} + +type benchmarkableMap interface { + Load(key int64) *testValue + Store(key int64, val *testValue) + LoadOrStore(key int64, val *testValue) (*testValue, bool) + Delete(key int64) +} + +// rwMutexMap implements benchmarkableMap for a RWMutex-protected Go map. +type rwMutexMap struct { + mu sync.RWMutex + m map[int64]*testValue +} + +func (m *rwMutexMap) Load(key int64) *testValue { + m.mu.RLock() + defer m.mu.RUnlock() + return m.m[key] +} + +func (m *rwMutexMap) Store(key int64, val *testValue) { + m.mu.Lock() + defer m.mu.Unlock() + if m.m == nil { + m.m = make(map[int64]*testValue) + } + m.m[key] = val +} + +func (m *rwMutexMap) LoadOrStore(key int64, val *testValue) (*testValue, bool) { + m.mu.Lock() + defer m.mu.Unlock() + if m.m == nil { + m.m = make(map[int64]*testValue) + } + if oldVal, ok := m.m[key]; ok { + return oldVal, true + } + m.m[key] = val + return val, false +} + +func (m *rwMutexMap) Delete(key int64) { + m.mu.Lock() + defer m.mu.Unlock() + delete(m.m, key) +} + +// syncMap implements benchmarkableMap for a sync.Map. +type syncMap struct { + m sync.Map +} + +func (m *syncMap) Load(key int64) *testValue { + val, ok := m.m.Load(key) + if !ok { + return nil + } + return val.(*testValue) +} + +func (m *syncMap) Store(key int64, val *testValue) { + m.m.Store(key, val) +} + +func (m *syncMap) LoadOrStore(key int64, val *testValue) (*testValue, bool) { + actual, loaded := m.m.LoadOrStore(key, val) + return actual.(*testValue), loaded +} + +func (m *syncMap) Delete(key int64) { + m.m.Delete(key) +} + +// benchmarkableAtomicPtrMap implements benchmarkableMap for testAtomicPtrMap. +type benchmarkableAtomicPtrMap struct { + m testAtomicPtrMap +} + +func (m *benchmarkableAtomicPtrMap) Load(key int64) *testValue { + return m.m.Load(key) +} + +func (m *benchmarkableAtomicPtrMap) Store(key int64, val *testValue) { + m.m.Store(key, val) +} + +func (m *benchmarkableAtomicPtrMap) LoadOrStore(key int64, val *testValue) (*testValue, bool) { + if prev := m.m.CompareAndSwap(key, nil, val); prev != nil { + return prev, true + } + return val, false +} + +func (m *benchmarkableAtomicPtrMap) Delete(key int64) { + m.m.Store(key, nil) +} + +// benchmarkableAtomicPtrMapSharded implements benchmarkableMap for testAtomicPtrMapSharded. +type benchmarkableAtomicPtrMapSharded struct { + m testAtomicPtrMapSharded +} + +func (m *benchmarkableAtomicPtrMapSharded) Load(key int64) *testValue { + return m.m.Load(key) +} + +func (m *benchmarkableAtomicPtrMapSharded) Store(key int64, val *testValue) { + m.m.Store(key, val) +} + +func (m *benchmarkableAtomicPtrMapSharded) LoadOrStore(key int64, val *testValue) (*testValue, bool) { + if prev := m.m.CompareAndSwap(key, nil, val); prev != nil { + return prev, true + } + return val, false +} + +func (m *benchmarkableAtomicPtrMapSharded) Delete(key int64) { + m.m.Store(key, nil) +} + +var mapImpls = [...]struct { + name string + ctor func() benchmarkableMap +}{ + { + name: "RWMutexMap", + ctor: func() benchmarkableMap { + return new(rwMutexMap) + }, + }, + { + name: "SyncMap", + ctor: func() benchmarkableMap { + return new(syncMap) + }, + }, + { + name: "AtomicPtrMap", + ctor: func() benchmarkableMap { + return new(benchmarkableAtomicPtrMap) + }, + }, + { + name: "AtomicPtrMapSharded", + ctor: func() benchmarkableMap { + return new(benchmarkableAtomicPtrMapSharded) + }, + }, +} + +func benchmarkStoreDelete(b *testing.B, mapCtor func() benchmarkableMap) { + m := mapCtor() + val := &testValue{} + for i := 0; i < b.N; i++ { + m.Store(int64(i), val) + } + for i := 0; i < b.N; i++ { + m.Delete(int64(i)) + } +} + +func BenchmarkStoreDelete(b *testing.B) { + for _, mapImpl := range mapImpls { + b.Run(mapImpl.name, func(b *testing.B) { + benchmarkStoreDelete(b, mapImpl.ctor) + }) + } +} + +func benchmarkLoadOrStoreDelete(b *testing.B, mapCtor func() benchmarkableMap) { + m := mapCtor() + val := &testValue{} + for i := 0; i < b.N; i++ { + m.LoadOrStore(int64(i), val) + } + for i := 0; i < b.N; i++ { + m.Delete(int64(i)) + } +} + +func BenchmarkLoadOrStoreDelete(b *testing.B) { + for _, mapImpl := range mapImpls { + b.Run(mapImpl.name, func(b *testing.B) { + benchmarkLoadOrStoreDelete(b, mapImpl.ctor) + }) + } +} + +func benchmarkLookupPositive(b *testing.B, mapCtor func() benchmarkableMap) { + m := mapCtor() + val := &testValue{} + for i := 0; i < b.N; i++ { + m.Store(int64(i), val) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + m.Load(int64(i)) + } +} + +func BenchmarkLookupPositive(b *testing.B) { + for _, mapImpl := range mapImpls { + b.Run(mapImpl.name, func(b *testing.B) { + benchmarkLookupPositive(b, mapImpl.ctor) + }) + } +} + +func benchmarkLookupNegative(b *testing.B, mapCtor func() benchmarkableMap) { + m := mapCtor() + val := &testValue{} + for i := 0; i < b.N; i++ { + m.Store(int64(i), val) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + m.Load(int64(-1 - i)) + } +} + +func BenchmarkLookupNegative(b *testing.B) { + for _, mapImpl := range mapImpls { + b.Run(mapImpl.name, func(b *testing.B) { + benchmarkLookupNegative(b, mapImpl.ctor) + }) + } +} + +type benchmarkConcurrentOptions struct { + // loadsPerMutationPair is the number of map lookups between each + // insertion/deletion pair. + loadsPerMutationPair int + + // If changeKeys is true, the keys used by each goroutine change between + // iterations of the test. + changeKeys bool +} + +func benchmarkConcurrent(b *testing.B, mapCtor func() benchmarkableMap, opts benchmarkConcurrentOptions) { + var ( + started sync.WaitGroup + workers sync.WaitGroup + ) + started.Add(1) + + m := mapCtor() + val := &testValue{} + // Insert a large number of unused elements into the map so that used + // elements are distributed throughout memory. + for i := 0; i < 10000; i++ { + m.Store(int64(-1-i), val) + } + // n := ceil(b.N / (opts.loadsPerMutationPair + 2)) + n := (b.N + opts.loadsPerMutationPair + 1) / (opts.loadsPerMutationPair + 2) + for i, procs := 0, runtime.GOMAXPROCS(0); i < procs; i++ { + workerID := i + workers.Add(1) + go func() { + defer workers.Done() + started.Wait() + for i := 0; i < n; i++ { + var key int64 + if opts.changeKeys { + key = int64(workerID*n + i) + } else { + key = int64(workerID) + } + m.LoadOrStore(key, val) + for j := 0; j < opts.loadsPerMutationPair; j++ { + m.Load(key) + } + m.Delete(key) + } + }() + } + + b.ResetTimer() + started.Done() + workers.Wait() +} + +func BenchmarkConcurrent(b *testing.B) { + changeKeysChoices := [...]struct { + name string + val bool + }{ + {"FixedKeys", false}, + {"ChangingKeys", true}, + } + writePcts := [...]struct { + name string + loadsPerMutationPair int + }{ + {"1PercentWrites", 198}, + {"10PercentWrites", 18}, + {"50PercentWrites", 2}, + } + for _, changeKeys := range changeKeysChoices { + for _, writePct := range writePcts { + for _, mapImpl := range mapImpls { + name := fmt.Sprintf("%s_%s_%s", changeKeys.name, writePct.name, mapImpl.name) + b.Run(name, func(b *testing.B) { + benchmarkConcurrent(b, mapImpl.ctor, benchmarkConcurrentOptions{ + loadsPerMutationPair: writePct.loadsPerMutationPair, + changeKeys: changeKeys.val, + }) + }) + } + } + } +} diff --git a/pkg/sync/checklocks_off_unsafe.go b/pkg/sync/checklocks_off_unsafe.go new file mode 100644 index 000000000..62c81b149 --- /dev/null +++ b/pkg/sync/checklocks_off_unsafe.go @@ -0,0 +1,18 @@ +// Copyright 2020 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !checklocks + +package sync + +import ( + "unsafe" +) + +func noteLock(l unsafe.Pointer) { +} + +func noteUnlock(l unsafe.Pointer) { +} diff --git a/pkg/sync/checklocks_on_unsafe.go b/pkg/sync/checklocks_on_unsafe.go new file mode 100644 index 000000000..24f933ed1 --- /dev/null +++ b/pkg/sync/checklocks_on_unsafe.go @@ -0,0 +1,108 @@ +// Copyright 2020 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build checklocks + +package sync + +import ( + "fmt" + "strings" + "sync" + "unsafe" + + "gvisor.dev/gvisor/pkg/goid" +) + +// gLocks contains metadata about the locks held by a goroutine. +type gLocks struct { + locksHeld []unsafe.Pointer +} + +// map[goid int]*gLocks +// +// Each key may only be written by the G with the goid it refers to. +// +// Note that entries are not evicted when a G exit, causing unbounded growth +// with new G creation / destruction. If this proves problematic, entries could +// be evicted when no locks are held at the expense of more allocations when +// taking top-level locks. +var locksHeld sync.Map + +func getGLocks() *gLocks { + id := goid.Get() + + var locks *gLocks + if l, ok := locksHeld.Load(id); ok { + locks = l.(*gLocks) + } else { + locks = &gLocks{ + // Initialize space for a few locks. + locksHeld: make([]unsafe.Pointer, 0, 8), + } + locksHeld.Store(id, locks) + } + + return locks +} + +func noteLock(l unsafe.Pointer) { + locks := getGLocks() + + for _, lock := range locks.locksHeld { + if lock == l { + panic(fmt.Sprintf("Deadlock on goroutine %d! Double lock of %p: %+v", goid.Get(), l, locks)) + } + } + + // Commit only after checking for panic conditions so that this lock + // isn't on the list if the above panic is recovered. + locks.locksHeld = append(locks.locksHeld, l) +} + +func noteUnlock(l unsafe.Pointer) { + locks := getGLocks() + + if len(locks.locksHeld) == 0 { + panic(fmt.Sprintf("Unlock of %p on goroutine %d without any locks held! All locks:\n%s", l, goid.Get(), dumpLocks())) + } + + // Search backwards since callers are most likely to unlock in LIFO order. + length := len(locks.locksHeld) + for i := length - 1; i >= 0; i-- { + if l == locks.locksHeld[i] { + copy(locks.locksHeld[i:length-1], locks.locksHeld[i+1:length]) + // Clear last entry to ensure addr can be GC'd. + locks.locksHeld[length-1] = nil + locks.locksHeld = locks.locksHeld[:length-1] + return + } + } + + panic(fmt.Sprintf("Unlock of %p on goroutine %d without matching lock! All locks:\n%s", l, goid.Get(), dumpLocks())) +} + +func dumpLocks() string { + var s strings.Builder + locksHeld.Range(func(key, value interface{}) bool { + goid := key.(int64) + locks := value.(*gLocks) + + // N.B. accessing gLocks of another G is fundamentally racy. + + fmt.Fprintf(&s, "goroutine %d:\n", goid) + if len(locks.locksHeld) == 0 { + fmt.Fprintf(&s, "\t<none>\n") + } + for _, lock := range locks.locksHeld { + fmt.Fprintf(&s, "\t%p\n", lock) + } + fmt.Fprintf(&s, "\n") + + return true + }) + + return s.String() +} diff --git a/pkg/sync/atomicptr_unsafe.go b/pkg/sync/generic_atomicptr_unsafe.go index 525c4beed..82b6df18c 100644 --- a/pkg/sync/atomicptr_unsafe.go +++ b/pkg/sync/generic_atomicptr_unsafe.go @@ -3,9 +3,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Package template doesn't exist. This file must be instantiated using the +// Package seqatomic doesn't exist. This file must be instantiated using the // go_template_instance rule in tools/go_generics/defs.bzl. -package template +package seqatomic import ( "sync/atomic" diff --git a/pkg/sync/generic_atomicptrmap_unsafe.go b/pkg/sync/generic_atomicptrmap_unsafe.go new file mode 100644 index 000000000..c70dda6dd --- /dev/null +++ b/pkg/sync/generic_atomicptrmap_unsafe.go @@ -0,0 +1,503 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package atomicptrmap doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package atomicptrmap + +import ( + "reflect" + "runtime" + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sync" +) + +// Key is a required type parameter. +type Key struct{} + +// Value is a required type parameter. +type Value struct{} + +const ( + // ShardOrder is an optional parameter specifying the base-2 log of the + // number of shards per AtomicPtrMap. Higher values of ShardOrder reduce + // unnecessary synchronization between unrelated concurrent operations, + // improving performance for write-heavy workloads, but increase memory + // usage for small maps. + ShardOrder = 0 +) + +// Hasher is an optional type parameter. If Hasher is provided, it must define +// the Init and Hash methods. One Hasher will be shared by all AtomicPtrMaps. +type Hasher struct { + defaultHasher +} + +// defaultHasher is the default Hasher. This indirection exists because +// defaultHasher must exist even if a custom Hasher is provided, to prevent the +// Go compiler from complaining about defaultHasher's unused imports. +type defaultHasher struct { + fn func(unsafe.Pointer, uintptr) uintptr + seed uintptr +} + +// Init initializes the Hasher. +func (h *defaultHasher) Init() { + h.fn = sync.MapKeyHasher(map[Key]*Value(nil)) + h.seed = sync.RandUintptr() +} + +// Hash returns the hash value for the given Key. +func (h *defaultHasher) Hash(key Key) uintptr { + return h.fn(gohacks.Noescape(unsafe.Pointer(&key)), h.seed) +} + +var hasher Hasher + +func init() { + hasher.Init() +} + +// An AtomicPtrMap maps Keys to non-nil pointers to Values. AtomicPtrMap are +// safe for concurrent use from multiple goroutines without additional +// synchronization. +// +// The zero value of AtomicPtrMap is empty (maps all Keys to nil) and ready for +// use. AtomicPtrMaps must not be copied after first use. +// +// sync.Map may be faster than AtomicPtrMap if most operations on the map are +// concurrent writes to a fixed set of keys. AtomicPtrMap is usually faster in +// other circumstances. +type AtomicPtrMap struct { + // AtomicPtrMap is implemented as a hash table with the following + // properties: + // + // * Collisions are resolved with quadratic probing. Of the two major + // alternatives, Robin Hood linear probing makes it difficult for writers + // to execute in parallel, and bucketing is less effective in Go due to + // lack of SIMD. + // + // * The table is optionally divided into shards indexed by hash to further + // reduce unnecessary synchronization. + + shards [1 << ShardOrder]apmShard +} + +func (m *AtomicPtrMap) shard(hash uintptr) *apmShard { + // Go defines right shifts >= width of shifted unsigned operand as 0, so + // this is correct even if ShardOrder is 0 (although nogo complains because + // nogo is dumb). + const indexLSB = unsafe.Sizeof(uintptr(0))*8 - ShardOrder + index := hash >> indexLSB + return (*apmShard)(unsafe.Pointer(uintptr(unsafe.Pointer(&m.shards)) + (index * unsafe.Sizeof(apmShard{})))) +} + +type apmShard struct { + apmShardMutationData + _ [apmShardMutationDataPadding]byte + apmShardLookupData + _ [apmShardLookupDataPadding]byte +} + +type apmShardMutationData struct { + dirtyMu sync.Mutex // serializes slot transitions out of empty + dirty uintptr // # slots with val != nil + count uintptr // # slots with val != nil and val != tombstone() + rehashMu sync.Mutex // serializes rehashing +} + +type apmShardLookupData struct { + seq sync.SeqCount // allows atomic reads of slots+mask + slots unsafe.Pointer // [mask+1]slot or nil; protected by rehashMu/seq + mask uintptr // always (a power of 2) - 1; protected by rehashMu/seq +} + +const ( + cacheLineBytes = 64 + // Cache line padding is enabled if sharding is. + apmEnablePadding = (ShardOrder + 63) >> 6 // 0 if ShardOrder == 0, 1 otherwise + // The -1 and +1 below are required to ensure that if unsafe.Sizeof(T) % + // cacheLineBytes == 0, then padding is 0 (rather than cacheLineBytes). + apmShardMutationDataRequiredPadding = cacheLineBytes - (((unsafe.Sizeof(apmShardMutationData{}) - 1) % cacheLineBytes) + 1) + apmShardMutationDataPadding = apmEnablePadding * apmShardMutationDataRequiredPadding + apmShardLookupDataRequiredPadding = cacheLineBytes - (((unsafe.Sizeof(apmShardLookupData{}) - 1) % cacheLineBytes) + 1) + apmShardLookupDataPadding = apmEnablePadding * apmShardLookupDataRequiredPadding + + // These define fractional thresholds for when apmShard.rehash() is called + // (i.e. the load factor) and when it rehases to a larger table + // respectively. They are chosen such that the rehash threshold = the + // expansion threshold + 1/2, so that when reuse of deleted slots is rare + // or non-existent, rehashing occurs after the insertion of at least 1/2 + // the table's size in new entries, which is acceptably infrequent. + apmRehashThresholdNum = 2 + apmRehashThresholdDen = 3 + apmExpansionThresholdNum = 1 + apmExpansionThresholdDen = 6 +) + +type apmSlot struct { + // slot states are indicated by val: + // + // * Empty: val == nil; key is meaningless. May transition to full or + // evacuated with dirtyMu locked. + // + // * Full: val != nil, tombstone(), or evacuated(); key is immutable. val + // is the Value mapped to key. May transition to deleted or evacuated. + // + // * Deleted: val == tombstone(); key is still immutable. key is mapped to + // no Value. May transition to full or evacuated. + // + // * Evacuated: val == evacuated(); key is immutable. Set by rehashing on + // slots that have already been moved, requiring readers to wait for + // rehashing to complete and use the new table. Terminal state. + // + // Note that once val is non-nil, it cannot become nil again. That is, the + // transition from empty to non-empty is irreversible for a given slot; + // the only way to create more empty slots is by rehashing. + val unsafe.Pointer + key Key +} + +func apmSlotAt(slots unsafe.Pointer, pos uintptr) *apmSlot { + return (*apmSlot)(unsafe.Pointer(uintptr(slots) + pos*unsafe.Sizeof(apmSlot{}))) +} + +var tombstoneObj byte + +func tombstone() unsafe.Pointer { + return unsafe.Pointer(&tombstoneObj) +} + +var evacuatedObj byte + +func evacuated() unsafe.Pointer { + return unsafe.Pointer(&evacuatedObj) +} + +// Load returns the Value stored in m for key. +func (m *AtomicPtrMap) Load(key Key) *Value { + hash := hasher.Hash(key) + shard := m.shard(hash) + +retry: + epoch := shard.seq.BeginRead() + slots := atomic.LoadPointer(&shard.slots) + mask := atomic.LoadUintptr(&shard.mask) + if !shard.seq.ReadOk(epoch) { + goto retry + } + if slots == nil { + return nil + } + + i := hash & mask + inc := uintptr(1) + for { + slot := apmSlotAt(slots, i) + slotVal := atomic.LoadPointer(&slot.val) + if slotVal == nil { + // Empty slot; end of probe sequence. + return nil + } + if slotVal == evacuated() { + // Racing with rehashing. + goto retry + } + if slot.key == key { + if slotVal == tombstone() { + return nil + } + return (*Value)(slotVal) + } + i = (i + inc) & mask + inc++ + } +} + +// Store stores the Value val for key. +func (m *AtomicPtrMap) Store(key Key, val *Value) { + m.maybeCompareAndSwap(key, false, nil, val) +} + +// Swap stores the Value val for key and returns the previously-mapped Value. +func (m *AtomicPtrMap) Swap(key Key, val *Value) *Value { + return m.maybeCompareAndSwap(key, false, nil, val) +} + +// CompareAndSwap checks that the Value stored for key is oldVal; if it is, it +// stores the Value newVal for key. CompareAndSwap returns the previous Value +// stored for key, whether or not it stores newVal. +func (m *AtomicPtrMap) CompareAndSwap(key Key, oldVal, newVal *Value) *Value { + return m.maybeCompareAndSwap(key, true, oldVal, newVal) +} + +func (m *AtomicPtrMap) maybeCompareAndSwap(key Key, compare bool, typedOldVal, typedNewVal *Value) *Value { + hash := hasher.Hash(key) + shard := m.shard(hash) + oldVal := tombstone() + if typedOldVal != nil { + oldVal = unsafe.Pointer(typedOldVal) + } + newVal := tombstone() + if typedNewVal != nil { + newVal = unsafe.Pointer(typedNewVal) + } + +retry: + epoch := shard.seq.BeginRead() + slots := atomic.LoadPointer(&shard.slots) + mask := atomic.LoadUintptr(&shard.mask) + if !shard.seq.ReadOk(epoch) { + goto retry + } + if slots == nil { + if (compare && oldVal != tombstone()) || newVal == tombstone() { + return nil + } + // Need to allocate a table before insertion. + shard.rehash(nil) + goto retry + } + + i := hash & mask + inc := uintptr(1) + for { + slot := apmSlotAt(slots, i) + slotVal := atomic.LoadPointer(&slot.val) + if slotVal == nil { + if (compare && oldVal != tombstone()) || newVal == tombstone() { + return nil + } + // Try to grab this slot for ourselves. + shard.dirtyMu.Lock() + slotVal = atomic.LoadPointer(&slot.val) + if slotVal == nil { + // Check if we need to rehash before dirtying a slot. + if dirty, capacity := shard.dirty+1, mask+1; dirty*apmRehashThresholdDen >= capacity*apmRehashThresholdNum { + shard.dirtyMu.Unlock() + shard.rehash(slots) + goto retry + } + slot.key = key + atomic.StorePointer(&slot.val, newVal) // transitions slot to full + shard.dirty++ + atomic.AddUintptr(&shard.count, 1) + shard.dirtyMu.Unlock() + return nil + } + // Raced with another store; the slot is no longer empty. Continue + // with the new value of slotVal since we may have raced with + // another store of key. + shard.dirtyMu.Unlock() + } + if slotVal == evacuated() { + // Racing with rehashing. + goto retry + } + if slot.key == key { + // We're reusing an existing slot, so rehashing isn't necessary. + for { + if (compare && oldVal != slotVal) || newVal == slotVal { + if slotVal == tombstone() { + return nil + } + return (*Value)(slotVal) + } + if atomic.CompareAndSwapPointer(&slot.val, slotVal, newVal) { + if slotVal == tombstone() { + atomic.AddUintptr(&shard.count, 1) + return nil + } + if newVal == tombstone() { + atomic.AddUintptr(&shard.count, ^uintptr(0) /* -1 */) + } + return (*Value)(slotVal) + } + slotVal = atomic.LoadPointer(&slot.val) + if slotVal == evacuated() { + goto retry + } + } + } + // This produces a triangular number sequence of offsets from the + // initially-probed position. + i = (i + inc) & mask + inc++ + } +} + +// rehash is marked nosplit to avoid preemption during table copying. +//go:nosplit +func (shard *apmShard) rehash(oldSlots unsafe.Pointer) { + shard.rehashMu.Lock() + defer shard.rehashMu.Unlock() + + if shard.slots != oldSlots { + // Raced with another call to rehash(). + return + } + + // Determine the size of the new table. Constraints: + // + // * The size of the table must be a power of two to ensure that every slot + // is visitable by every probe sequence under quadratic probing with + // triangular numbers. + // + // * The size of the table cannot decrease because even if shard.count is + // currently smaller than shard.dirty, concurrent stores that reuse + // existing slots can drive shard.count back up to a maximum of + // shard.dirty. + newSize := uintptr(8) // arbitrary initial size + if oldSlots != nil { + oldSize := shard.mask + 1 + newSize = oldSize + if count := atomic.LoadUintptr(&shard.count) + 1; count*apmExpansionThresholdDen > oldSize*apmExpansionThresholdNum { + newSize *= 2 + } + } + + // Allocate the new table. + newSlotsSlice := make([]apmSlot, newSize) + newSlotsReflect := (*reflect.SliceHeader)(unsafe.Pointer(&newSlotsSlice)) + newSlots := unsafe.Pointer(newSlotsReflect.Data) + runtime.KeepAlive(newSlotsSlice) + newMask := newSize - 1 + + // Start a writer critical section now so that racing users of the old + // table that observe evacuated() wait for the new table. (But lock dirtyMu + // first since doing so may block, which we don't want to do during the + // writer critical section.) + shard.dirtyMu.Lock() + shard.seq.BeginWrite() + + if oldSlots != nil { + realCount := uintptr(0) + // Copy old entries to the new table. + oldMask := shard.mask + for i := uintptr(0); i <= oldMask; i++ { + oldSlot := apmSlotAt(oldSlots, i) + val := atomic.SwapPointer(&oldSlot.val, evacuated()) + if val == nil || val == tombstone() { + continue + } + hash := hasher.Hash(oldSlot.key) + j := hash & newMask + inc := uintptr(1) + for { + newSlot := apmSlotAt(newSlots, j) + if newSlot.val == nil { + newSlot.val = val + newSlot.key = oldSlot.key + break + } + j = (j + inc) & newMask + inc++ + } + realCount++ + } + // Update dirty to reflect that tombstones were not copied to the new + // table. Use realCount since a concurrent mutator may not have updated + // shard.count yet. + shard.dirty = realCount + } + + // Switch to the new table. + atomic.StorePointer(&shard.slots, newSlots) + atomic.StoreUintptr(&shard.mask, newMask) + + shard.seq.EndWrite() + shard.dirtyMu.Unlock() +} + +// Range invokes f on each Key-Value pair stored in m. If any call to f returns +// false, Range stops iteration and returns. +// +// Range does not necessarily correspond to any consistent snapshot of the +// Map's contents: no Key will be visited more than once, but if the Value for +// any Key is stored or deleted concurrently, Range may reflect any mapping for +// that Key from any point during the Range call. +// +// f must not call other methods on m. +func (m *AtomicPtrMap) Range(f func(key Key, val *Value) bool) { + for si := 0; si < len(m.shards); si++ { + shard := &m.shards[si] + if !shard.doRange(f) { + return + } + } +} + +func (shard *apmShard) doRange(f func(key Key, val *Value) bool) bool { + // We have to lock rehashMu because if we handled races with rehashing by + // retrying, f could see the same key twice. + shard.rehashMu.Lock() + defer shard.rehashMu.Unlock() + slots := shard.slots + if slots == nil { + return true + } + mask := shard.mask + for i := uintptr(0); i <= mask; i++ { + slot := apmSlotAt(slots, i) + slotVal := atomic.LoadPointer(&slot.val) + if slotVal == nil || slotVal == tombstone() { + continue + } + if !f(slot.key, (*Value)(slotVal)) { + return false + } + } + return true +} + +// RangeRepeatable is like Range, but: +// +// * RangeRepeatable may visit the same Key multiple times in the presence of +// concurrent mutators, possibly passing different Values to f in different +// calls. +// +// * It is safe for f to call other methods on m. +func (m *AtomicPtrMap) RangeRepeatable(f func(key Key, val *Value) bool) { + for si := 0; si < len(m.shards); si++ { + shard := &m.shards[si] + + retry: + epoch := shard.seq.BeginRead() + slots := atomic.LoadPointer(&shard.slots) + mask := atomic.LoadUintptr(&shard.mask) + if !shard.seq.ReadOk(epoch) { + goto retry + } + if slots == nil { + continue + } + + for i := uintptr(0); i <= mask; i++ { + slot := apmSlotAt(slots, i) + slotVal := atomic.LoadPointer(&slot.val) + if slotVal == evacuated() { + goto retry + } + if slotVal == nil || slotVal == tombstone() { + continue + } + if !f(slot.key, (*Value)(slotVal)) { + return + } + } + } +} diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/generic_seqatomic_unsafe.go index 2184cb5ab..82b676abf 100644 --- a/pkg/sync/seqatomic_unsafe.go +++ b/pkg/sync/generic_seqatomic_unsafe.go @@ -3,25 +3,17 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Package template doesn't exist. This file must be instantiated using the +// Package seqatomic doesn't exist. This file must be instantiated using the // go_template_instance rule in tools/go_generics/defs.bzl. -package template +package seqatomic import ( - "fmt" - "reflect" - "strings" "unsafe" "gvisor.dev/gvisor/pkg/sync" ) // Value is a required type parameter. -// -// Value must not contain any pointers, including interface objects, function -// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs -// containing any of the above. An init() function will panic if this property -// does not hold. type Value struct{} // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race @@ -55,12 +47,3 @@ func SeqAtomicTryLoad(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) ok = seq.ReadOk(epoch) return } - -func init() { - var val Value - typ := reflect.TypeOf(val) - name := typ.Name() - if ptrs := sync.PointersInType(typ, name); len(ptrs) != 0 { - panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) - } -} diff --git a/pkg/sync/goyield_go113_unsafe.go b/pkg/sync/goyield_go113_unsafe.go new file mode 100644 index 000000000..8aee0d455 --- /dev/null +++ b/pkg/sync/goyield_go113_unsafe.go @@ -0,0 +1,18 @@ +// Copyright 2020 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.13 +// +build !go1.14 + +package sync + +import ( + "runtime" +) + +func goyield() { + // goyield is not available until Go 1.14. + runtime.Gosched() +} diff --git a/pkg/sync/spin_unsafe.go b/pkg/sync/goyield_unsafe.go index cafb2d065..672ee274d 100644 --- a/pkg/sync/spin_unsafe.go +++ b/pkg/sync/goyield_unsafe.go @@ -3,7 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 +// +build go1.14 // +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -14,11 +14,5 @@ import ( _ "unsafe" // for go:linkname ) -//go:linkname canSpin sync.runtime_canSpin -func canSpin(i int) bool - -//go:linkname doSpin sync.runtime_doSpin -func doSpin() - //go:linkname goyield runtime.goyield func goyield() diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go deleted file mode 100644 index f5e630009..000000000 --- a/pkg/sync/memmove_unsafe.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.12 -// +build !go1.17 - -// Check go:linkname function signatures when updating Go version. - -package sync - -import ( - "unsafe" -) - -//go:linkname memmove runtime.memmove -//go:noescape -func memmove(to, from unsafe.Pointer, n uintptr) - -// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't -// define it because go_generics can't update the go:linkname annotation. -// Furthermore, go:linkname silently doesn't work if the local name is exported -// (this is of course undocumented), which is why this indirection is -// necessary. -func Memmove(to, from unsafe.Pointer, n uintptr) { - memmove(to, from, n) -} diff --git a/pkg/sync/mutex_test.go b/pkg/sync/mutex_test.go index 0838248b4..4fb51a8ab 100644 --- a/pkg/sync/mutex_test.go +++ b/pkg/sync/mutex_test.go @@ -32,11 +32,11 @@ func TestStructSize(t *testing.T) { func TestFieldValues(t *testing.T) { var m Mutex m.Lock() - if got := *m.state(); got != mutexLocked { + if got := *m.m.state(); got != mutexLocked { t.Errorf("got locked sync.Mutex.state = %d, want = %d", got, mutexLocked) } m.Unlock() - if got := *m.state(); got != mutexUnlocked { + if got := *m.m.state(); got != mutexUnlocked { t.Errorf("got unlocked sync.Mutex.state = %d, want = %d", got, mutexUnlocked) } } diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go index f4c2e9642..21084b857 100644 --- a/pkg/sync/mutex_unsafe.go +++ b/pkg/sync/mutex_unsafe.go @@ -17,8 +17,9 @@ import ( "unsafe" ) -// Mutex is a try lock. -type Mutex struct { +// CrossGoroutineMutex is equivalent to Mutex, but it need not be unlocked by a +// the same goroutine that locked the mutex. +type CrossGoroutineMutex struct { sync.Mutex } @@ -27,7 +28,7 @@ type syncMutex struct { sema uint32 } -func (m *Mutex) state() *int32 { +func (m *CrossGoroutineMutex) state() *int32 { return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state } @@ -36,9 +37,9 @@ const ( mutexLocked = 1 ) -// TryLock tries to aquire the mutex. It returns true if it succeeds and false +// TryLock tries to acquire the mutex. It returns true if it succeeds and false // otherwise. TryLock does not block. -func (m *Mutex) TryLock() bool { +func (m *CrossGoroutineMutex) TryLock() bool { if atomic.CompareAndSwapInt32(m.state(), mutexUnlocked, mutexLocked) { if RaceEnabled { RaceAcquire(unsafe.Pointer(&m.Mutex)) @@ -47,3 +48,43 @@ func (m *Mutex) TryLock() bool { } return false } + +// Mutex is a mutual exclusion lock. The zero value for a Mutex is an unlocked +// mutex. +// +// A Mutex must not be copied after first use. +// +// A Mutex must be unlocked by the same goroutine that locked it. This +// invariant is enforced with the 'checklocks' build tag. +type Mutex struct { + m CrossGoroutineMutex +} + +// Lock locks m. If the lock is already in use, the calling goroutine blocks +// until the mutex is available. +func (m *Mutex) Lock() { + noteLock(unsafe.Pointer(m)) + m.m.Lock() +} + +// Unlock unlocks m. +// +// Preconditions: +// * m is locked. +// * m was locked by this goroutine. +func (m *Mutex) Unlock() { + noteUnlock(unsafe.Pointer(m)) + m.m.Unlock() +} + +// TryLock tries to acquire the mutex. It returns true if it succeeds and false +// otherwise. TryLock does not block. +func (m *Mutex) TryLock() bool { + // Note lock first to enforce proper locking even if unsuccessful. + noteLock(unsafe.Pointer(m)) + locked := m.m.TryLock() + if !locked { + noteUnlock(unsafe.Pointer(m)) + } + return locked +} diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go index 006055dd6..70b5f3a5e 100644 --- a/pkg/sync/norace_unsafe.go +++ b/pkg/sync/norace_unsafe.go @@ -8,6 +8,7 @@ package sync import ( + "sync/atomic" "unsafe" ) @@ -33,3 +34,13 @@ func RaceRelease(addr unsafe.Pointer) { // RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. func RaceReleaseMerge(addr unsafe.Pointer) { } + +// RaceUncheckedAtomicCompareAndSwapUintptr is equivalent to +// sync/atomic.CompareAndSwapUintptr, but is not checked by the race detector. +// This is necessary when implementing gopark callbacks, since no race context +// is available during their execution. +func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool { + // Use atomic.CompareAndSwapUintptr outside of race builds for + // inlinability. + return atomic.CompareAndSwapUintptr(ptr, old, new) +} diff --git a/pkg/syncevent/waiter_amd64.s b/pkg/sync/race_amd64.s index 5e216b045..57bc0ec79 100644 --- a/pkg/syncevent/waiter_amd64.s +++ b/pkg/sync/race_amd64.s @@ -12,21 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build race +// +build amd64 + #include "textflag.h" -// See waiter_noasm_unsafe.go for a description of waiterUnlock. -// -// func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool -TEXT ·waiterUnlock(SB),NOSPLIT,$0-24 +// func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool +TEXT ·RaceUncheckedAtomicCompareAndSwapUintptr(SB),NOSPLIT,$0-25 MOVQ ptr+0(FP), DI - MOVQ wg+8(FP), SI + MOVQ old+8(FP), AX + MOVQ new+16(FP), SI - MOVQ $·preparingG(SB), AX LOCK - CMPXCHGQ DI, 0(SI) + CMPXCHGQ SI, 0(DI) SETEQ AX - MOVB AX, ret+16(FP) + MOVB AX, ret+24(FP) RET diff --git a/pkg/syncevent/waiter_arm64.s b/pkg/sync/race_arm64.s index f4c06f194..88f091fda 100644 --- a/pkg/syncevent/waiter_arm64.s +++ b/pkg/sync/race_arm64.s @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build race +// +build arm64 + #include "textflag.h" -// See waiter_noasm_unsafe.go for a description of waiterUnlock. -// -// func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool -TEXT ·waiterUnlock(SB),NOSPLIT,$0-24 - MOVD wg+8(FP), R0 - MOVD $·preparingG(SB), R1 - MOVD ptr+0(FP), R2 +// func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool +TEXT ·RaceUncheckedAtomicCompareAndSwapUintptr(SB),NOSPLIT,$0-25 + MOVD ptr+0(FP), R0 + MOVD old+8(FP), R1 + MOVD new+16(FP), R1 again: LDAXR (R0), R3 CMP R1, R3 @@ -29,6 +30,6 @@ again: CBNZ R3, again ok: CSET EQ, R0 - MOVB R0, ret+16(FP) + MOVB R0, ret+24(FP) RET diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go index 31d8fa9a6..59985c270 100644 --- a/pkg/sync/race_unsafe.go +++ b/pkg/sync/race_unsafe.go @@ -39,3 +39,9 @@ func RaceRelease(addr unsafe.Pointer) { func RaceReleaseMerge(addr unsafe.Pointer) { runtime.RaceReleaseMerge(addr) } + +// RaceUncheckedAtomicCompareAndSwapUintptr is equivalent to +// sync/atomic.CompareAndSwapUintptr, but is not checked by the race detector. +// This is necessary when implementing gopark callbacks, since no race context +// is available during their execution. +func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool diff --git a/pkg/sync/runtime_unsafe.go b/pkg/sync/runtime_unsafe.go new file mode 100644 index 000000000..e925e2e5b --- /dev/null +++ b/pkg/sync/runtime_unsafe.go @@ -0,0 +1,129 @@ +// Copyright 2020 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.13 +// +build !go1.17 + +// Check function signatures and constants when updating Go version. + +package sync + +import ( + "fmt" + "reflect" + "unsafe" +) + +// Note that go:linkname silently doesn't work if the local name is exported, +// necessitating an indirection for exported functions. + +// Memmove is runtime.memmove, exported for SeqAtomicLoad/SeqAtomicTryLoad<T>. +// +//go:nosplit +func Memmove(to, from unsafe.Pointer, n uintptr) { + memmove(to, from, n) +} + +//go:linkname memmove runtime.memmove +//go:noescape +func memmove(to, from unsafe.Pointer, n uintptr) + +// Gopark is runtime.gopark. Gopark calls unlockf(pointer to runtime.g, lock); +// if unlockf returns true, Gopark blocks until Goready(pointer to runtime.g) +// is called. unlockf and its callees must be nosplit and norace, since stack +// splitting and race context are not available where it is called. +// +//go:nosplit +func Gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceEv byte, traceskip int) { + gopark(unlockf, lock, reason, traceEv, traceskip) +} + +//go:linkname gopark runtime.gopark +func gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceEv byte, traceskip int) + +// Goready is runtime.goready. +// +//go:nosplit +func Goready(gp uintptr, traceskip int) { + goready(gp, traceskip) +} + +//go:linkname goready runtime.goready +func goready(gp uintptr, traceskip int) + +// Values for the reason argument to gopark, from Go's src/runtime/runtime2.go. +const ( + WaitReasonSelect uint8 = 9 +) + +// Values for the traceEv argument to gopark, from Go's src/runtime/trace.go. +const ( + TraceEvGoBlockSelect byte = 24 +) + +// Rand32 returns a non-cryptographically-secure random uint32. +func Rand32() uint32 { + return fastrand() +} + +// Rand64 returns a non-cryptographically-secure random uint64. +func Rand64() uint64 { + return uint64(fastrand())<<32 | uint64(fastrand()) +} + +//go:linkname fastrand runtime.fastrand +func fastrand() uint32 + +// RandUintptr returns a non-cryptographically-secure random uintptr. +func RandUintptr() uintptr { + if unsafe.Sizeof(uintptr(0)) == 4 { + return uintptr(Rand32()) + } + return uintptr(Rand64()) +} + +// MapKeyHasher returns a hash function for pointers of m's key type. +// +// Preconditions: m must be a map. +func MapKeyHasher(m interface{}) func(unsafe.Pointer, uintptr) uintptr { + if rtyp := reflect.TypeOf(m); rtyp.Kind() != reflect.Map { + panic(fmt.Sprintf("sync.MapKeyHasher: m is %v, not map", rtyp)) + } + mtyp := *(**maptype)(unsafe.Pointer(&m)) + return mtyp.hasher +} + +type maptype struct { + size uintptr + ptrdata uintptr + hash uint32 + tflag uint8 + align uint8 + fieldAlign uint8 + kind uint8 + equal func(unsafe.Pointer, unsafe.Pointer) bool + gcdata *byte + str int32 + ptrToThis int32 + key unsafe.Pointer + elem unsafe.Pointer + bucket unsafe.Pointer + hasher func(unsafe.Pointer, uintptr) uintptr + // more fields +} + +// These functions are only used within the sync package. + +//go:linkname semacquire sync.runtime_Semacquire +func semacquire(s *uint32) + +//go:linkname semrelease sync.runtime_Semrelease +func semrelease(s *uint32, handoff bool, skipframes int) + +//go:linkname canSpin sync.runtime_canSpin +func canSpin(i int) bool + +//go:linkname doSpin sync.runtime_doSpin +func doSpin() diff --git a/pkg/sync/rwmutex_test.go b/pkg/sync/rwmutex_test.go index ce667e825..5ca96d12b 100644 --- a/pkg/sync/rwmutex_test.go +++ b/pkg/sync/rwmutex_test.go @@ -102,7 +102,7 @@ func downgradingWriter(rwm *RWMutex, numIterations int, activity *int32, cdone c } for i := 0; i < 100; i++ { } - n = atomic.AddInt32(activity, -1) + atomic.AddInt32(activity, -1) rwm.RUnlock() } cdone <- true diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go index b3b4dee78..4cf3fcd6e 100644 --- a/pkg/sync/rwmutex_unsafe.go +++ b/pkg/sync/rwmutex_unsafe.go @@ -3,11 +3,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 -// +build !go1.17 - -// Check go:linkname function signatures when updating Go version. - // This is mostly copied from the standard library's sync/rwmutex.go. // // Happens-before relationships indicated to the race detector: @@ -23,16 +18,15 @@ import ( "unsafe" ) -//go:linkname runtimeSemacquire sync.runtime_Semacquire -func runtimeSemacquire(s *uint32) - -//go:linkname runtimeSemrelease sync.runtime_Semrelease -func runtimeSemrelease(s *uint32, handoff bool, skipframes int) - -// RWMutex is identical to sync.RWMutex, but adds the DowngradeLock, -// TryLock and TryRLock methods. -type RWMutex struct { - w Mutex // held if there are pending writers +// CrossGoroutineRWMutex is equivalent to RWMutex, but it need not be unlocked +// by a the same goroutine that locked the mutex. +type CrossGoroutineRWMutex struct { + // w is held if there are pending writers + // + // We use CrossGoroutineMutex rather than Mutex because the lock + // annotation instrumentation in Mutex will trigger false positives in + // the race detector when called inside of RaceDisable. + w CrossGoroutineMutex writerSem uint32 // semaphore for writers to wait for completing readers readerSem uint32 // semaphore for readers to wait for completing writers readerCount int32 // number of pending readers @@ -43,7 +37,7 @@ const rwmutexMaxReaders = 1 << 30 // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. -func (rw *RWMutex) TryRLock() bool { +func (rw *CrossGoroutineRWMutex) TryRLock() bool { if RaceEnabled { RaceDisable() } @@ -67,13 +61,17 @@ func (rw *RWMutex) TryRLock() bool { } // RLock locks rw for reading. -func (rw *RWMutex) RLock() { +// +// It should not be used for recursive read locking; a blocked Lock call +// excludes new readers from acquiring the lock. See the documentation on the +// RWMutex type. +func (rw *CrossGoroutineRWMutex) RLock() { if RaceEnabled { RaceDisable() } if atomic.AddInt32(&rw.readerCount, 1) < 0 { // A writer is pending, wait for it. - runtimeSemacquire(&rw.readerSem) + semacquire(&rw.readerSem) } if RaceEnabled { RaceEnable() @@ -82,7 +80,10 @@ func (rw *RWMutex) RLock() { } // RUnlock undoes a single RLock call. -func (rw *RWMutex) RUnlock() { +// +// Preconditions: +// * rw is locked for reading. +func (rw *CrossGoroutineRWMutex) RUnlock() { if RaceEnabled { RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) RaceDisable() @@ -94,7 +95,7 @@ func (rw *RWMutex) RUnlock() { // A writer is pending. if atomic.AddInt32(&rw.readerWait, -1) == 0 { // The last reader unblocks the writer. - runtimeSemrelease(&rw.writerSem, false, 0) + semrelease(&rw.writerSem, false, 0) } } if RaceEnabled { @@ -104,7 +105,7 @@ func (rw *RWMutex) RUnlock() { // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. -func (rw *RWMutex) TryLock() bool { +func (rw *CrossGoroutineRWMutex) TryLock() bool { if RaceEnabled { RaceDisable() } @@ -130,8 +131,9 @@ func (rw *RWMutex) TryLock() bool { return true } -// Lock locks rw for writing. -func (rw *RWMutex) Lock() { +// Lock locks rw for writing. If the lock is already locked for reading or +// writing, Lock blocks until the lock is available. +func (rw *CrossGoroutineRWMutex) Lock() { if RaceEnabled { RaceDisable() } @@ -141,7 +143,7 @@ func (rw *RWMutex) Lock() { r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders // Wait for active readers. if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { - runtimeSemacquire(&rw.writerSem) + semacquire(&rw.writerSem) } if RaceEnabled { RaceEnable() @@ -150,7 +152,10 @@ func (rw *RWMutex) Lock() { } // Unlock unlocks rw for writing. -func (rw *RWMutex) Unlock() { +// +// Preconditions: +// * rw is locked for writing. +func (rw *CrossGoroutineRWMutex) Unlock() { if RaceEnabled { RaceRelease(unsafe.Pointer(&rw.writerSem)) RaceRelease(unsafe.Pointer(&rw.readerSem)) @@ -163,7 +168,7 @@ func (rw *RWMutex) Unlock() { } // Unblock blocked readers, if any. for i := 0; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) + semrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed. rw.w.Unlock() @@ -173,7 +178,10 @@ func (rw *RWMutex) Unlock() { } // DowngradeLock atomically unlocks rw for writing and locks it for reading. -func (rw *RWMutex) DowngradeLock() { +// +// Preconditions: +// * rw is locked for writing. +func (rw *CrossGoroutineRWMutex) DowngradeLock() { if RaceEnabled { RaceRelease(unsafe.Pointer(&rw.readerSem)) RaceDisable() @@ -186,7 +194,7 @@ func (rw *RWMutex) DowngradeLock() { // Unblock blocked readers, if any. Note that this loop starts as 1 since r // includes this goroutine. for i := 1; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) + semrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed to rw.w.Lock(). Note that they will still // block on rw.writerSem since at least this reader exists, such that @@ -196,3 +204,91 @@ func (rw *RWMutex) DowngradeLock() { RaceEnable() } } + +// A RWMutex is a reader/writer mutual exclusion lock. The lock can be held by +// an arbitrary number of readers or a single writer. The zero value for a +// RWMutex is an unlocked mutex. +// +// A RWMutex must not be copied after first use. +// +// If a goroutine holds a RWMutex for reading and another goroutine might call +// Lock, no goroutine should expect to be able to acquire a read lock until the +// initial read lock is released. In particular, this prohibits recursive read +// locking. This is to ensure that the lock eventually becomes available; a +// blocked Lock call excludes new readers from acquiring the lock. +// +// A Mutex must be unlocked by the same goroutine that locked it. This +// invariant is enforced with the 'checklocks' build tag. +type RWMutex struct { + m CrossGoroutineRWMutex +} + +// TryRLock locks rw for reading. It returns true if it succeeds and false +// otherwise. It does not block. +func (rw *RWMutex) TryRLock() bool { + // Note lock first to enforce proper locking even if unsuccessful. + noteLock(unsafe.Pointer(rw)) + locked := rw.m.TryRLock() + if !locked { + noteUnlock(unsafe.Pointer(rw)) + } + return locked +} + +// RLock locks rw for reading. +// +// It should not be used for recursive read locking; a blocked Lock call +// excludes new readers from acquiring the lock. See the documentation on the +// RWMutex type. +func (rw *RWMutex) RLock() { + noteLock(unsafe.Pointer(rw)) + rw.m.RLock() +} + +// RUnlock undoes a single RLock call. +// +// Preconditions: +// * rw is locked for reading. +// * rw was locked by this goroutine. +func (rw *RWMutex) RUnlock() { + rw.m.RUnlock() + noteUnlock(unsafe.Pointer(rw)) +} + +// TryLock locks rw for writing. It returns true if it succeeds and false +// otherwise. It does not block. +func (rw *RWMutex) TryLock() bool { + // Note lock first to enforce proper locking even if unsuccessful. + noteLock(unsafe.Pointer(rw)) + locked := rw.m.TryLock() + if !locked { + noteUnlock(unsafe.Pointer(rw)) + } + return locked +} + +// Lock locks rw for writing. If the lock is already locked for reading or +// writing, Lock blocks until the lock is available. +func (rw *RWMutex) Lock() { + noteLock(unsafe.Pointer(rw)) + rw.m.Lock() +} + +// Unlock unlocks rw for writing. +// +// Preconditions: +// * rw is locked for writing. +// * rw was locked by this goroutine. +func (rw *RWMutex) Unlock() { + rw.m.Unlock() + noteUnlock(unsafe.Pointer(rw)) +} + +// DowngradeLock atomically unlocks rw for writing and locks it for reading. +// +// Preconditions: +// * rw is locked for writing. +func (rw *RWMutex) DowngradeLock() { + // No note change for DowngradeLock. + rw.m.DowngradeLock() +} diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go index 2c5d3df99..1f025f33c 100644 --- a/pkg/sync/seqcount.go +++ b/pkg/sync/seqcount.go @@ -6,8 +6,6 @@ package sync import ( - "fmt" - "reflect" "sync/atomic" ) @@ -27,9 +25,6 @@ import ( // - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other // operations to be made atomic with reads of SeqCount-protected data. // -// - SeqCount may be less flexible: as of this writing, SeqCount-protected data -// cannot include pointers. -// // - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected // data require instantiating function templates using go_generics (see // seqatomic.go). @@ -128,32 +123,3 @@ func (s *SeqCount) EndWrite() { panic("SeqCount.EndWrite outside writer critical section") } } - -// PointersInType returns a list of pointers reachable from values named -// valName of the given type. -// -// PointersInType is not exhaustive, but it is guaranteed that if typ contains -// at least one pointer, then PointersInTypeOf returns a non-empty list. -func PointersInType(typ reflect.Type, valName string) []string { - switch kind := typ.Kind(); kind { - case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - return nil - - case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: - return []string{valName} - - case reflect.Array: - return PointersInType(typ.Elem(), valName+"[]") - - case reflect.Struct: - var ptrs []string - for i, n := 0, typ.NumField(); i < n; i++ { - field := typ.Field(i) - ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) - } - return ptrs - - default: - return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} - } -} diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go index 6eb7b4b59..3f5592e3e 100644 --- a/pkg/sync/seqcount_test.go +++ b/pkg/sync/seqcount_test.go @@ -6,7 +6,6 @@ package sync import ( - "reflect" "testing" "time" ) @@ -99,55 +98,3 @@ func BenchmarkSeqCountReadUncontended(b *testing.B) { } }) } - -func TestPointersInType(t *testing.T) { - for _, test := range []struct { - name string // used for both test and value name - val interface{} - ptrs []string - }{ - { - name: "EmptyStruct", - val: struct{}{}, - }, - { - name: "Int", - val: int(0), - }, - { - name: "MixedStruct", - val: struct { - b bool - I int - ExportedPtr *struct{} - unexportedPtr *struct{} - arr [2]int - ptrArr [2]*int - nestedStruct struct { - nestedNonptr int - nestedPtr *int - } - structArr [1]struct { - nonptr int - ptr *int - } - }{}, - ptrs: []string{ - "MixedStruct.ExportedPtr", - "MixedStruct.unexportedPtr", - "MixedStruct.ptrArr[]", - "MixedStruct.nestedStruct.nestedPtr", - "MixedStruct.structArr[].ptr", - }, - }, - } { - t.Run(test.name, func(t *testing.T) { - typ := reflect.TypeOf(test.val) - ptrs := PointersInType(typ, test.name) - t.Logf("Found pointers: %v", ptrs) - if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { - t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) - } - }) - } -} diff --git a/pkg/syncevent/BUILD b/pkg/syncevent/BUILD index 0500a22cf..42c553308 100644 --- a/pkg/syncevent/BUILD +++ b/pkg/syncevent/BUILD @@ -9,10 +9,6 @@ go_library( "receiver.go", "source.go", "syncevent.go", - "waiter_amd64.s", - "waiter_arm64.s", - "waiter_asm_unsafe.go", - "waiter_noasm_unsafe.go", "waiter_unsafe.go", ], visibility = ["//:sandbox"], diff --git a/pkg/syncevent/waiter_noasm_unsafe.go b/pkg/syncevent/waiter_noasm_unsafe.go deleted file mode 100644 index 0f74a689c..000000000 --- a/pkg/syncevent/waiter_noasm_unsafe.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// waiterUnlock is called from g0, so when the race detector is enabled, -// waiterUnlock must be implemented in assembly since no race context is -// available. -// -// +build !race -// +build !amd64,!arm64 - -package syncevent - -import ( - "sync/atomic" - "unsafe" -) - -// waiterUnlock is the "unlock function" passed to runtime.gopark by -// Waiter.Wait*. wg is &Waiter.g, and g is a pointer to the calling runtime.g. -// waiterUnlock returns true if Waiter.Wait should sleep and false if sleeping -// should be aborted. -// -//go:nosplit -func waiterUnlock(ptr unsafe.Pointer, wg *unsafe.Pointer) bool { - // The only way this CAS can fail is if a call to Waiter.NotifyPending() - // has replaced *wg with nil, in which case we should not sleep. - return atomic.CompareAndSwapPointer(wg, (unsafe.Pointer)(&preparingG), ptr) -} diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go index 518f18479..b6ed2852d 100644 --- a/pkg/syncevent/waiter_unsafe.go +++ b/pkg/syncevent/waiter_unsafe.go @@ -12,11 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.11 -// +build !go1.17 - -// Check go:linkname function signatures when updating Go version. - package syncevent import ( @@ -26,17 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) -//go:linkname gopark runtime.gopark -func gopark(unlockf func(unsafe.Pointer, *unsafe.Pointer) bool, wg *unsafe.Pointer, reason uint8, traceEv byte, traceskip int) - -//go:linkname goready runtime.goready -func goready(g unsafe.Pointer, traceskip int) - -const ( - waitReasonSelect = 9 // Go: src/runtime/runtime2.go - traceEvGoBlockSelect = 24 // Go: src/runtime/trace.go -) - // Waiter allows a goroutine to block on pending events received by a Receiver. // // Waiter.Init() must be called before first use. @@ -45,20 +29,19 @@ type Waiter struct { // g is one of: // - // - nil: No goroutine is blocking in Wait. + // - 0: No goroutine is blocking in Wait. // - // - &preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet + // - preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet // completed waiterUnlock(). Thus the wait can only be interrupted by - // replacing the value of g with nil (the G may not be in state Gwaiting - // yet, so we can't call goready.) + // replacing the value of g with 0 (the G may not be in state Gwaiting yet, + // so we can't call goready.) // // - Otherwise: g is a pointer to the runtime.g in state Gwaiting for the // goroutine blocked in Wait, which can only be woken by calling goready. - g unsafe.Pointer `state:"zerovalue"` + g uintptr `state:"zerovalue"` } -// Sentinel object for Waiter.g. -var preparingG struct{} +const preparingG = 1 // Init must be called before first use of w. func (w *Waiter) Init() { @@ -99,21 +82,29 @@ func (w *Waiter) WaitFor(es Set) Set { } // Indicate that we're preparing to go to sleep. - atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG)) + atomic.StoreUintptr(&w.g, preparingG) // If an event is pending, abort the sleep. if p := w.r.Pending(); p&es != NoEvents { - atomic.StorePointer(&w.g, nil) + atomic.StoreUintptr(&w.g, 0) return p } // If w.g is still preparingG (i.e. w.NotifyPending() has not been - // called or has not reached atomic.SwapPointer()), go to sleep until + // called or has not reached atomic.SwapUintptr()), go to sleep until // w.NotifyPending() => goready(). - gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0) + sync.Gopark(waiterCommit, unsafe.Pointer(&w.g), sync.WaitReasonSelect, sync.TraceEvGoBlockSelect, 0) } } +//go:norace +//go:nosplit +func waiterCommit(g uintptr, wg unsafe.Pointer) bool { + // The only way this CAS can fail is if a call to Waiter.NotifyPending() + // has replaced *wg with nil, in which case we should not sleep. + return sync.RaceUncheckedAtomicCompareAndSwapUintptr((*uintptr)(wg), preparingG, g) +} + // Ack marks the given events as not pending. func (w *Waiter) Ack(es Set) { w.r.Ack(es) @@ -135,20 +126,20 @@ func (w *Waiter) WaitAndAckAll() Set { for { // Indicate that we're preparing to go to sleep. - atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG)) + atomic.StoreUintptr(&w.g, preparingG) // If an event is pending, abort the sleep. if w.r.Pending() != NoEvents { if p := w.r.PendingAndAckAll(); p != NoEvents { - atomic.StorePointer(&w.g, nil) + atomic.StoreUintptr(&w.g, 0) return p } } // If w.g is still preparingG (i.e. w.NotifyPending() has not been - // called or has not reached atomic.SwapPointer()), go to sleep until + // called or has not reached atomic.SwapUintptr()), go to sleep until // w.NotifyPending() => goready(). - gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0) + sync.Gopark(waiterCommit, unsafe.Pointer(&w.g), sync.WaitReasonSelect, sync.TraceEvGoBlockSelect, 0) // Check for pending events. We call PendingAndAckAll() directly now since // we only expect to be woken after events become pending. @@ -171,14 +162,14 @@ func (w *Waiter) NotifyPending() { // goroutine. NotifyPending is called after w.r.Pending() is updated, so // concurrent and future calls to w.Wait() will observe pending events and // abort sleeping. - if atomic.LoadPointer(&w.g) == nil { + if atomic.LoadUintptr(&w.g) == 0 { return } // Wake a sleeping G, or prevent a G that is preparing to sleep from doing // so. Swap is needed here to ensure that only one call to NotifyPending // calls goready. - if g := atomic.SwapPointer(&w.g, nil); g != nil && g != (unsafe.Pointer)(&preparingG) { - goready(g, 0) + if g := atomic.SwapUintptr(&w.g, 0); g > preparingG { + sync.Goready(g, 0) } } diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index fc6ef60a1..77faa3670 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -32,7 +32,7 @@ var linuxHostTranslations [maxErrno]linuxHostTranslation // FromHost translates a syscall.Errno to a corresponding Error value. func FromHost(err syscall.Errno) *Error { - if err < 0 || int(err) >= len(linuxHostTranslations) || !linuxHostTranslations[err].ok { + if int(err) >= len(linuxHostTranslations) || !linuxHostTranslations[err].ok { panic(fmt.Sprintf("unknown host errno %q (%d)", err.Error(), err)) } return linuxHostTranslations[err].err diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 5ae10939d..77c3c110c 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -15,6 +15,8 @@ package syserr import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/tcpip" ) @@ -48,45 +50,56 @@ var ( ErrNotPermittedNet = New(tcpip.ErrNotPermitted.String(), linux.EPERM) ) -var netstackErrorTranslations = map[*tcpip.Error]*Error{ - tcpip.ErrUnknownProtocol: ErrUnknownProtocol, - tcpip.ErrUnknownNICID: ErrUnknownNICID, - tcpip.ErrUnknownDevice: ErrUnknownDevice, - tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption, - tcpip.ErrDuplicateNICID: ErrDuplicateNICID, - tcpip.ErrDuplicateAddress: ErrDuplicateAddress, - tcpip.ErrNoRoute: ErrNoRoute, - tcpip.ErrBadLinkEndpoint: ErrBadLinkEndpoint, - tcpip.ErrAlreadyBound: ErrAlreadyBound, - tcpip.ErrInvalidEndpointState: ErrInvalidEndpointState, - tcpip.ErrAlreadyConnecting: ErrAlreadyConnecting, - tcpip.ErrAlreadyConnected: ErrAlreadyConnected, - tcpip.ErrNoPortAvailable: ErrNoPortAvailable, - tcpip.ErrPortInUse: ErrPortInUse, - tcpip.ErrBadLocalAddress: ErrBadLocalAddress, - tcpip.ErrClosedForSend: ErrClosedForSend, - tcpip.ErrClosedForReceive: ErrClosedForReceive, - tcpip.ErrWouldBlock: ErrWouldBlock, - tcpip.ErrConnectionRefused: ErrConnectionRefused, - tcpip.ErrTimeout: ErrTimeout, - tcpip.ErrAborted: ErrAborted, - tcpip.ErrConnectStarted: ErrConnectStarted, - tcpip.ErrDestinationRequired: ErrDestinationRequired, - tcpip.ErrNotSupported: ErrNotSupported, - tcpip.ErrQueueSizeNotSupported: ErrQueueSizeNotSupported, - tcpip.ErrNotConnected: ErrNotConnected, - tcpip.ErrConnectionReset: ErrConnectionReset, - tcpip.ErrConnectionAborted: ErrConnectionAborted, - tcpip.ErrNoSuchFile: ErrNoSuchFile, - tcpip.ErrInvalidOptionValue: ErrInvalidOptionValue, - tcpip.ErrNoLinkAddress: ErrHostDown, - tcpip.ErrBadAddress: ErrBadAddress, - tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, - tcpip.ErrMessageTooLong: ErrMessageTooLong, - tcpip.ErrNoBufferSpace: ErrNoBufferSpace, - tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, - tcpip.ErrNotPermitted: ErrNotPermittedNet, - tcpip.ErrAddressFamilyNotSupported: ErrAddressFamilyNotSupported, +var netstackErrorTranslations map[string]*Error + +func addErrMapping(tcpipErr *tcpip.Error, netstackErr *Error) { + key := tcpipErr.String() + if _, ok := netstackErrorTranslations[key]; ok { + panic(fmt.Sprintf("duplicate error key: %s", key)) + } + netstackErrorTranslations[key] = netstackErr +} + +func init() { + netstackErrorTranslations = make(map[string]*Error) + addErrMapping(tcpip.ErrUnknownProtocol, ErrUnknownProtocol) + addErrMapping(tcpip.ErrUnknownNICID, ErrUnknownNICID) + addErrMapping(tcpip.ErrUnknownDevice, ErrUnknownDevice) + addErrMapping(tcpip.ErrUnknownProtocolOption, ErrUnknownProtocolOption) + addErrMapping(tcpip.ErrDuplicateNICID, ErrDuplicateNICID) + addErrMapping(tcpip.ErrDuplicateAddress, ErrDuplicateAddress) + addErrMapping(tcpip.ErrNoRoute, ErrNoRoute) + addErrMapping(tcpip.ErrBadLinkEndpoint, ErrBadLinkEndpoint) + addErrMapping(tcpip.ErrAlreadyBound, ErrAlreadyBound) + addErrMapping(tcpip.ErrInvalidEndpointState, ErrInvalidEndpointState) + addErrMapping(tcpip.ErrAlreadyConnecting, ErrAlreadyConnecting) + addErrMapping(tcpip.ErrAlreadyConnected, ErrAlreadyConnected) + addErrMapping(tcpip.ErrNoPortAvailable, ErrNoPortAvailable) + addErrMapping(tcpip.ErrPortInUse, ErrPortInUse) + addErrMapping(tcpip.ErrBadLocalAddress, ErrBadLocalAddress) + addErrMapping(tcpip.ErrClosedForSend, ErrClosedForSend) + addErrMapping(tcpip.ErrClosedForReceive, ErrClosedForReceive) + addErrMapping(tcpip.ErrWouldBlock, ErrWouldBlock) + addErrMapping(tcpip.ErrConnectionRefused, ErrConnectionRefused) + addErrMapping(tcpip.ErrTimeout, ErrTimeout) + addErrMapping(tcpip.ErrAborted, ErrAborted) + addErrMapping(tcpip.ErrConnectStarted, ErrConnectStarted) + addErrMapping(tcpip.ErrDestinationRequired, ErrDestinationRequired) + addErrMapping(tcpip.ErrNotSupported, ErrNotSupported) + addErrMapping(tcpip.ErrQueueSizeNotSupported, ErrQueueSizeNotSupported) + addErrMapping(tcpip.ErrNotConnected, ErrNotConnected) + addErrMapping(tcpip.ErrConnectionReset, ErrConnectionReset) + addErrMapping(tcpip.ErrConnectionAborted, ErrConnectionAborted) + addErrMapping(tcpip.ErrNoSuchFile, ErrNoSuchFile) + addErrMapping(tcpip.ErrInvalidOptionValue, ErrInvalidOptionValue) + addErrMapping(tcpip.ErrNoLinkAddress, ErrHostDown) + addErrMapping(tcpip.ErrBadAddress, ErrBadAddress) + addErrMapping(tcpip.ErrNetworkUnreachable, ErrNetworkUnreachable) + addErrMapping(tcpip.ErrMessageTooLong, ErrMessageTooLong) + addErrMapping(tcpip.ErrNoBufferSpace, ErrNoBufferSpace) + addErrMapping(tcpip.ErrBroadcastDisabled, ErrBroadcastDisabled) + addErrMapping(tcpip.ErrNotPermitted, ErrNotPermittedNet) + addErrMapping(tcpip.ErrAddressFamilyNotSupported, ErrAddressFamilyNotSupported) } // TranslateNetstackError converts an error from the tcpip package to a sentry @@ -95,7 +108,7 @@ func TranslateNetstackError(err *tcpip.Error) *Error { if err == nil { return nil } - se, ok := netstackErrorTranslations[err] + se, ok := netstackErrorTranslations[err.String()] if !ok { panic("Unknown error: " + err.String()) } diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index 81f762e10..91971b687 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -20,6 +20,7 @@ import ( "encoding/binary" "reflect" "testing" + "time" "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" @@ -116,6 +117,10 @@ func TTL(ttl uint8) NetworkChecker { v = ip.TTL() case header.IPv6: v = ip.HopLimit() + case *ipv6HeaderWithExtHdr: + v = ip.HopLimit() + default: + t.Fatalf("unrecognized header type %T for TTL evaluation", ip) } if v != ttl { t.Fatalf("Bad TTL, got = %d, want = %d", v, ttl) @@ -216,6 +221,42 @@ func IPv4Options(want header.IPv4Options) NetworkChecker { } } +// IPv4RouterAlert returns a checker that checks that the RouterAlert option is +// set in an IPv4 packet. +func IPv4RouterAlert() NetworkChecker { + return func(t *testing.T, h []header.Network) { + t.Helper() + ip, ok := h[0].(header.IPv4) + if !ok { + t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4", h[0]) + } + iterator := ip.Options().MakeIterator() + for { + opt, done, err := iterator.Next() + if err != nil { + t.Fatalf("error acquiring next IPv4 option %s", err) + } + if done { + break + } + if opt.Type() != header.IPv4OptionRouterAlertType { + continue + } + want := [header.IPv4OptionRouterAlertLength]byte{ + byte(header.IPv4OptionRouterAlertType), + header.IPv4OptionRouterAlertLength, + header.IPv4OptionRouterAlertValue, + header.IPv4OptionRouterAlertValue, + } + if diff := cmp.Diff(want[:], opt.Contents()); diff != "" { + t.Errorf("router alert option mismatch (-want +got):\n%s", diff) + } + return + } + t.Errorf("failed to find router alert option in %v", ip.Options()) + } +} + // FragmentOffset creates a checker that checks the FragmentOffset field. func FragmentOffset(offset uint16) NetworkChecker { return func(t *testing.T, h []header.Network) { @@ -284,6 +325,19 @@ func ReceiveIPPacketInfo(want tcpip.IPPacketInfo) ControlMessagesChecker { } } +// ReceiveOriginalDstAddr creates a checker that checks the OriginalDstAddress +// field in ControlMessages. +func ReceiveOriginalDstAddr(want tcpip.FullAddress) ControlMessagesChecker { + return func(t *testing.T, cm tcpip.ControlMessages) { + t.Helper() + if !cm.HasOriginalDstAddress { + t.Errorf("got cm.HasOriginalDstAddress = %t, want = true", cm.HasOriginalDstAddress) + } else if diff := cmp.Diff(want, cm.OriginalDstAddress); diff != "" { + t.Errorf("OriginalDstAddress mismatch (-want +got):\n%s", diff) + } + } +} + // TOS creates a checker that checks the TOS field. func TOS(tos uint8, label uint32) NetworkChecker { return func(t *testing.T, h []header.Network) { @@ -1012,6 +1066,74 @@ func ICMPv6Payload(want []byte) TransportChecker { } } +// MLD creates a checker that checks that the packet contains a valid MLD +// message for type of mldType, with potentially additional checks specified by +// checkers. +// +// Checkers may assume that a valid ICMPv6 is passed to it containing a valid +// MLD message as far as the size of the message (minSize) is concerned. The +// values within the message are up to checkers to validate. +func MLD(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) NetworkChecker { + return func(t *testing.T, h []header.Network) { + t.Helper() + + // Check normal ICMPv6 first. + ICMPv6( + ICMPv6Type(msgType), + ICMPv6Code(0))(t, h) + + last := h[len(h)-1] + + icmp := header.ICMPv6(last.Payload()) + if got := len(icmp.MessageBody()); got < minSize { + t.Fatalf("ICMPv6 MLD (type = %d) payload size of %d is less than the minimum size of %d", msgType, got, minSize) + } + + for _, f := range checkers { + f(t, icmp) + } + if t.Failed() { + t.FailNow() + } + } +} + +// MLDMaxRespDelay creates a checker that checks the Maximum Response Delay +// field of a MLD message. +// +// The returned TransportChecker assumes that a valid ICMPv6 is passed to it +// containing a valid MLD message as far as the size is concerned. +func MLDMaxRespDelay(want time.Duration) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + icmp := h.(header.ICMPv6) + ns := header.MLD(icmp.MessageBody()) + + if got := ns.MaximumResponseDelay(); got != want { + t.Errorf("got %T.MaximumResponseDelay() = %s, want = %s", ns, got, want) + } + } +} + +// MLDMulticastAddress creates a checker that checks the Multicast Address +// field of a MLD message. +// +// The returned TransportChecker assumes that a valid ICMPv6 is passed to it +// containing a valid MLD message as far as the size is concerned. +func MLDMulticastAddress(want tcpip.Address) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + icmp := h.(header.ICMPv6) + ns := header.MLD(icmp.MessageBody()) + + if got := ns.MulticastAddress(); got != want { + t.Errorf("got %T.MulticastAddress() = %s, want = %s", ns, got, want) + } + } +} + // NDP creates a checker that checks that the packet contains a valid NDP // message for type of ty, with potentially additional checks specified by // checkers. @@ -1031,7 +1153,7 @@ func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) N last := h[len(h)-1] icmp := header.ICMPv6(last.Payload()) - if got := len(icmp.NDPPayload()); got < minSize { + if got := len(icmp.MessageBody()); got < minSize { t.Fatalf("ICMPv6 NDP (type = %d) payload size of %d is less than the minimum size of %d", msgType, got, minSize) } @@ -1065,7 +1187,7 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker { t.Helper() icmp := h.(header.ICMPv6) - ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns := header.NDPNeighborSolicit(icmp.MessageBody()) if got := ns.TargetAddress(); got != want { t.Errorf("got %T.TargetAddress() = %s, want = %s", ns, got, want) @@ -1094,7 +1216,7 @@ func NDPNATargetAddress(want tcpip.Address) TransportChecker { t.Helper() icmp := h.(header.ICMPv6) - na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na := header.NDPNeighborAdvert(icmp.MessageBody()) if got := na.TargetAddress(); got != want { t.Errorf("got %T.TargetAddress() = %s, want = %s", na, got, want) @@ -1112,7 +1234,7 @@ func NDPNASolicitedFlag(want bool) TransportChecker { t.Helper() icmp := h.(header.ICMPv6) - na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na := header.NDPNeighborAdvert(icmp.MessageBody()) if got := na.SolicitedFlag(); got != want { t.Errorf("got %T.SolicitedFlag = %t, want = %t", na, got, want) @@ -1183,7 +1305,7 @@ func NDPNAOptions(opts []header.NDPOption) TransportChecker { t.Helper() icmp := h.(header.ICMPv6) - na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na := header.NDPNeighborAdvert(icmp.MessageBody()) ndpOptions(t, na.Options(), opts) } } @@ -1198,7 +1320,7 @@ func NDPNSOptions(opts []header.NDPOption) TransportChecker { t.Helper() icmp := h.(header.ICMPv6) - ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns := header.NDPNeighborSolicit(icmp.MessageBody()) ndpOptions(t, ns.Options(), opts) } } @@ -1223,7 +1345,261 @@ func NDPRSOptions(opts []header.NDPOption) TransportChecker { t.Helper() icmp := h.(header.ICMPv6) - rs := header.NDPRouterSolicit(icmp.NDPPayload()) + rs := header.NDPRouterSolicit(icmp.MessageBody()) ndpOptions(t, rs.Options(), opts) } } + +// IGMP checks the validity and properties of the given IGMP packet. It is +// expected to be used in conjunction with other IGMP transport checkers for +// specific properties. +func IGMP(checkers ...TransportChecker) NetworkChecker { + return func(t *testing.T, h []header.Network) { + t.Helper() + + last := h[len(h)-1] + + if p := last.TransportProtocol(); p != header.IGMPProtocolNumber { + t.Fatalf("Bad protocol, got %d, want %d", p, header.IGMPProtocolNumber) + } + + igmp := header.IGMP(last.Payload()) + for _, f := range checkers { + f(t, igmp) + } + if t.Failed() { + t.FailNow() + } + } +} + +// IGMPType creates a checker that checks the IGMP Type field. +func IGMPType(want header.IGMPType) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + igmp, ok := h.(header.IGMP) + if !ok { + t.Fatalf("got transport header = %T, want = header.IGMP", h) + } + if got := igmp.Type(); got != want { + t.Errorf("got igmp.Type() = %d, want = %d", got, want) + } + } +} + +// IGMPMaxRespTime creates a checker that checks the IGMP Max Resp Time field. +func IGMPMaxRespTime(want time.Duration) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + igmp, ok := h.(header.IGMP) + if !ok { + t.Fatalf("got transport header = %T, want = header.IGMP", h) + } + if got := igmp.MaxRespTime(); got != want { + t.Errorf("got igmp.MaxRespTime() = %s, want = %s", got, want) + } + } +} + +// IGMPGroupAddress creates a checker that checks the IGMP Group Address field. +func IGMPGroupAddress(want tcpip.Address) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + igmp, ok := h.(header.IGMP) + if !ok { + t.Fatalf("got transport header = %T, want = header.IGMP", h) + } + if got := igmp.GroupAddress(); got != want { + t.Errorf("got igmp.GroupAddress() = %s, want = %s", got, want) + } + } +} + +// IPv6ExtHdrChecker is a function to check an extension header. +type IPv6ExtHdrChecker func(*testing.T, header.IPv6PayloadHeader) + +// IPv6WithExtHdr is like IPv6 but allows IPv6 packets with extension headers. +func IPv6WithExtHdr(t *testing.T, b []byte, checkers ...NetworkChecker) { + t.Helper() + + ipv6 := header.IPv6(b) + if !ipv6.IsValid(len(b)) { + t.Error("not a valid IPv6 packet") + return + } + + payloadIterator := header.MakeIPv6PayloadIterator( + header.IPv6ExtensionHeaderIdentifier(ipv6.NextHeader()), + buffer.View(ipv6.Payload()).ToVectorisedView(), + ) + + var rawPayloadHeader header.IPv6RawPayloadHeader + for { + h, done, err := payloadIterator.Next() + if err != nil { + t.Errorf("payloadIterator.Next(): %s", err) + return + } + if done { + t.Errorf("got payloadIterator.Next() = (%T, %t, _), want = (_, true, _)", h, done) + return + } + r, ok := h.(header.IPv6RawPayloadHeader) + if ok { + rawPayloadHeader = r + break + } + } + + networkHeader := ipv6HeaderWithExtHdr{ + IPv6: ipv6, + transport: tcpip.TransportProtocolNumber(rawPayloadHeader.Identifier), + payload: rawPayloadHeader.Buf.ToView(), + } + + for _, checker := range checkers { + checker(t, []header.Network{&networkHeader}) + } +} + +// IPv6ExtHdr checks for the presence of extension headers. +// +// All the extension headers in headers will be checked exhaustively in the +// order provided. +func IPv6ExtHdr(headers ...IPv6ExtHdrChecker) NetworkChecker { + return func(t *testing.T, h []header.Network) { + t.Helper() + + extHdrs, ok := h[0].(*ipv6HeaderWithExtHdr) + if !ok { + t.Errorf("got network header = %T, want = *ipv6HeaderWithExtHdr", h[0]) + return + } + + payloadIterator := header.MakeIPv6PayloadIterator( + header.IPv6ExtensionHeaderIdentifier(extHdrs.IPv6.NextHeader()), + buffer.View(extHdrs.IPv6.Payload()).ToVectorisedView(), + ) + + for _, check := range headers { + h, done, err := payloadIterator.Next() + if err != nil { + t.Errorf("payloadIterator.Next(): %s", err) + return + } + if done { + t.Errorf("got payloadIterator.Next() = (%T, %t, _), want = (_, false, _)", h, done) + return + } + check(t, h) + } + // Validate we consumed all headers. + // + // The next one over should be a raw payload and then iterator should + // terminate. + wantDone := false + for { + h, done, err := payloadIterator.Next() + if err != nil { + t.Errorf("payloadIterator.Next(): %s", err) + return + } + if done != wantDone { + t.Errorf("got payloadIterator.Next() = (%T, %t, _), want = (_, %t, _)", h, done, wantDone) + return + } + if done { + break + } + if _, ok := h.(header.IPv6RawPayloadHeader); !ok { + t.Errorf("got payloadIterator.Next() = (%T, _, _), want = (header.IPv6RawPayloadHeader, _, _)", h) + continue + } + wantDone = true + } + } +} + +var _ header.Network = (*ipv6HeaderWithExtHdr)(nil) + +// ipv6HeaderWithExtHdr provides a header.Network implementation that takes +// extension headers into consideration, which is not the case with vanilla +// header.IPv6. +type ipv6HeaderWithExtHdr struct { + header.IPv6 + transport tcpip.TransportProtocolNumber + payload []byte +} + +// TransportProtocol implements header.Network. +func (h *ipv6HeaderWithExtHdr) TransportProtocol() tcpip.TransportProtocolNumber { + return h.transport +} + +// Payload implements header.Network. +func (h *ipv6HeaderWithExtHdr) Payload() []byte { + return h.payload +} + +// IPv6ExtHdrOptionChecker is a function to check an extension header option. +type IPv6ExtHdrOptionChecker func(*testing.T, header.IPv6ExtHdrOption) + +// IPv6HopByHopExtensionHeader checks the extension header is a Hop by Hop +// extension header and validates the containing options with checkers. +// +// checkers must exhaustively contain all the expected options. +func IPv6HopByHopExtensionHeader(checkers ...IPv6ExtHdrOptionChecker) IPv6ExtHdrChecker { + return func(t *testing.T, payloadHeader header.IPv6PayloadHeader) { + t.Helper() + + hbh, ok := payloadHeader.(header.IPv6HopByHopOptionsExtHdr) + if !ok { + t.Errorf("unexpected IPv6 payload header, got = %T, want = header.IPv6HopByHopOptionsExtHdr", payloadHeader) + return + } + optionsIterator := hbh.Iter() + for _, f := range checkers { + opt, done, err := optionsIterator.Next() + if err != nil { + t.Errorf("optionsIterator.Next(): %s", err) + return + } + if done { + t.Errorf("got optionsIterator.Next() = (%T, %t, _), want = (_, false, _)", opt, done) + } + f(t, opt) + } + // Validate all options were consumed. + for { + opt, done, err := optionsIterator.Next() + if err != nil { + t.Errorf("optionsIterator.Next(): %s", err) + return + } + if !done { + t.Errorf("got optionsIterator.Next() = (%T, %t, _), want = (_, true, _)", opt, done) + } + if done { + break + } + } + } +} + +// IPv6RouterAlert validates that an extension header option is the RouterAlert +// option and matches on its value. +func IPv6RouterAlert(want header.IPv6RouterAlertValue) IPv6ExtHdrOptionChecker { + return func(t *testing.T, opt header.IPv6ExtHdrOption) { + routerAlert, ok := opt.(*header.IPv6RouterAlertOption) + if !ok { + t.Errorf("unexpected extension header option, got = %T, want = header.IPv6RouterAlertOption", opt) + return + } + if routerAlert.Value != want { + t.Errorf("got routerAlert.Value = %d, want = %d", routerAlert.Value, want) + } + } +} diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index d87797617..0bdc12d53 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -11,11 +11,13 @@ go_library( "gue.go", "icmpv4.go", "icmpv6.go", + "igmp.go", "interfaces.go", "ipv4.go", "ipv6.go", "ipv6_extension_headers.go", "ipv6_fragment.go", + "mld.go", "ndp_neighbor_advert.go", "ndp_neighbor_solicit.go", "ndp_options.go", @@ -39,6 +41,8 @@ go_test( size = "small", srcs = [ "checksum_test.go", + "igmp_test.go", + "ipv4_test.go", "ipv6_test.go", "ipversion_test.go", "tcp_test.go", @@ -58,6 +62,7 @@ go_test( srcs = [ "eth_test.go", "ipv6_extension_headers_test.go", + "mld_test.go", "ndp_test.go", ], library = ":header", diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go index 4303fc5d5..2eef64b4d 100644 --- a/pkg/tcpip/header/icmpv6.go +++ b/pkg/tcpip/header/icmpv6.go @@ -115,6 +115,12 @@ const ( ICMPv6NeighborSolicit ICMPv6Type = 135 ICMPv6NeighborAdvert ICMPv6Type = 136 ICMPv6RedirectMsg ICMPv6Type = 137 + + // Multicast Listener Discovery (MLD) messages, see RFC 2710. + + ICMPv6MulticastListenerQuery ICMPv6Type = 130 + ICMPv6MulticastListenerReport ICMPv6Type = 131 + ICMPv6MulticastListenerDone ICMPv6Type = 132 ) // IsErrorType returns true if the receiver is an ICMP error type. @@ -245,10 +251,9 @@ func (b ICMPv6) SetSequence(sequence uint16) { binary.BigEndian.PutUint16(b[icmpv6SequenceOffset:], sequence) } -// NDPPayload returns the NDP payload buffer. That is, it returns the ICMPv6 -// packet's message body as defined by RFC 4443 section 2.1; the portion of the -// ICMPv6 buffer after the first ICMPv6HeaderSize bytes. -func (b ICMPv6) NDPPayload() []byte { +// MessageBody returns the message body as defined by RFC 4443 section 2.1; the +// portion of the ICMPv6 buffer after the first ICMPv6HeaderSize bytes. +func (b ICMPv6) MessageBody() []byte { return b[ICMPv6HeaderSize:] } diff --git a/pkg/tcpip/header/igmp.go b/pkg/tcpip/header/igmp.go new file mode 100644 index 000000000..5c5be1b9d --- /dev/null +++ b/pkg/tcpip/header/igmp.go @@ -0,0 +1,181 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header + +import ( + "encoding/binary" + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +// IGMP represents an IGMP header stored in a byte array. +type IGMP []byte + +// IGMP implements `Transport`. +var _ Transport = (*IGMP)(nil) + +const ( + // IGMPMinimumSize is the minimum size of a valid IGMP packet in bytes, + // as per RFC 2236, Section 2, Page 2. + IGMPMinimumSize = 8 + + // IGMPQueryMinimumSize is the minimum size of a valid Membership Query + // Message in bytes, as per RFC 2236, Section 2, Page 2. + IGMPQueryMinimumSize = 8 + + // IGMPReportMinimumSize is the minimum size of a valid Report Message in + // bytes, as per RFC 2236, Section 2, Page 2. + IGMPReportMinimumSize = 8 + + // IGMPLeaveMessageMinimumSize is the minimum size of a valid Leave Message + // in bytes, as per RFC 2236, Section 2, Page 2. + IGMPLeaveMessageMinimumSize = 8 + + // IGMPTTL is the TTL for all IGMP messages, as per RFC 2236, Section 3, Page + // 3. + IGMPTTL = 1 + + // igmpTypeOffset defines the offset of the type field in an IGMP message. + igmpTypeOffset = 0 + + // igmpMaxRespTimeOffset defines the offset of the MaxRespTime field in an + // IGMP message. + igmpMaxRespTimeOffset = 1 + + // igmpChecksumOffset defines the offset of the checksum field in an IGMP + // message. + igmpChecksumOffset = 2 + + // igmpGroupAddressOffset defines the offset of the Group Address field in an + // IGMP message. + igmpGroupAddressOffset = 4 + + // IGMPProtocolNumber is IGMP's transport protocol number. + IGMPProtocolNumber tcpip.TransportProtocolNumber = 2 +) + +// IGMPType is the IGMP type field as per RFC 2236. +type IGMPType byte + +// Values for the IGMP Type described in RFC 2236 Section 2.1, Page 2. +// Descriptions below come from there. +const ( + // IGMPMembershipQuery indicates that the message type is Membership Query. + // "There are two sub-types of Membership Query messages: + // - General Query, used to learn which groups have members on an + // attached network. + // - Group-Specific Query, used to learn if a particular group + // has any members on an attached network. + // These two messages are differentiated by the Group Address, as + // described in section 1.4 ." + IGMPMembershipQuery IGMPType = 0x11 + // IGMPv1MembershipReport indicates that the message is a Membership Report + // generated by a host using the IGMPv1 protocol: "an additional type of + // message, for backwards-compatibility with IGMPv1" + IGMPv1MembershipReport IGMPType = 0x12 + // IGMPv2MembershipReport indicates that the Message type is a Membership + // Report generated by a host using the IGMPv2 protocol. + IGMPv2MembershipReport IGMPType = 0x16 + // IGMPLeaveGroup indicates that the message type is a Leave Group + // notification message. + IGMPLeaveGroup IGMPType = 0x17 +) + +// Type is the IGMP type field. +func (b IGMP) Type() IGMPType { return IGMPType(b[igmpTypeOffset]) } + +// SetType sets the IGMP type field. +func (b IGMP) SetType(t IGMPType) { b[igmpTypeOffset] = byte(t) } + +// MaxRespTime gets the MaxRespTimeField. This is meaningful only in Membership +// Query messages, in other cases it is set to 0 by the sender and ignored by +// the receiver. +func (b IGMP) MaxRespTime() time.Duration { + // As per RFC 2236 section 2.2, + // + // The Max Response Time field is meaningful only in Membership Query + // messages, and specifies the maximum allowed time before sending a + // responding report in units of 1/10 second. In all other messages, it + // is set to zero by the sender and ignored by receivers. + return DecisecondToDuration(b[igmpMaxRespTimeOffset]) +} + +// SetMaxRespTime sets the MaxRespTimeField. +func (b IGMP) SetMaxRespTime(m byte) { b[igmpMaxRespTimeOffset] = m } + +// Checksum is the IGMP checksum field. +func (b IGMP) Checksum() uint16 { + return binary.BigEndian.Uint16(b[igmpChecksumOffset:]) +} + +// SetChecksum sets the IGMP checksum field. +func (b IGMP) SetChecksum(checksum uint16) { + binary.BigEndian.PutUint16(b[igmpChecksumOffset:], checksum) +} + +// GroupAddress gets the Group Address field. +func (b IGMP) GroupAddress() tcpip.Address { + return tcpip.Address(b[igmpGroupAddressOffset:][:IPv4AddressSize]) +} + +// SetGroupAddress sets the Group Address field. +func (b IGMP) SetGroupAddress(address tcpip.Address) { + if n := copy(b[igmpGroupAddressOffset:], address); n != IPv4AddressSize { + panic(fmt.Sprintf("copied %d bytes, expected %d", n, IPv4AddressSize)) + } +} + +// SourcePort implements Transport.SourcePort. +func (IGMP) SourcePort() uint16 { + return 0 +} + +// DestinationPort implements Transport.DestinationPort. +func (IGMP) DestinationPort() uint16 { + return 0 +} + +// SetSourcePort implements Transport.SetSourcePort. +func (IGMP) SetSourcePort(uint16) { +} + +// SetDestinationPort implements Transport.SetDestinationPort. +func (IGMP) SetDestinationPort(uint16) { +} + +// Payload implements Transport.Payload. +func (IGMP) Payload() []byte { + return nil +} + +// IGMPCalculateChecksum calculates the IGMP checksum over the provided IGMP +// header. +func IGMPCalculateChecksum(h IGMP) uint16 { + // The header contains a checksum itself, set it aside to avoid checksumming + // the checksum and replace it afterwards. + existingXsum := h.Checksum() + h.SetChecksum(0) + xsum := ^Checksum(h, 0) + h.SetChecksum(existingXsum) + return xsum +} + +// DecisecondToDuration converts a value representing deci-seconds to a +// time.Duration. +func DecisecondToDuration(ds uint8) time.Duration { + return time.Duration(ds) * time.Second / 10 +} diff --git a/pkg/tcpip/header/igmp_test.go b/pkg/tcpip/header/igmp_test.go new file mode 100644 index 000000000..b6126d29a --- /dev/null +++ b/pkg/tcpip/header/igmp_test.go @@ -0,0 +1,110 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header_test + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +// TestIGMPHeader tests the functions within header.igmp +func TestIGMPHeader(t *testing.T) { + const maxRespTimeTenthSec = 0xF0 + b := []byte{ + 0x11, // IGMP Type, Membership Query + maxRespTimeTenthSec, // Maximum Response Time + 0xC0, 0xC0, // Checksum + 0x01, 0x02, 0x03, 0x04, // Group Address + } + + igmpHeader := header.IGMP(b) + + if got, want := igmpHeader.Type(), header.IGMPMembershipQuery; got != want { + t.Errorf("got igmpHeader.Type() = %x, want = %x", got, want) + } + + if got, want := igmpHeader.MaxRespTime(), header.DecisecondToDuration(maxRespTimeTenthSec); got != want { + t.Errorf("got igmpHeader.MaxRespTime() = %s, want = %s", got, want) + } + + if got, want := igmpHeader.Checksum(), uint16(0xC0C0); got != want { + t.Errorf("got igmpHeader.Checksum() = %x, want = %x", got, want) + } + + if got, want := igmpHeader.GroupAddress(), tcpip.Address("\x01\x02\x03\x04"); got != want { + t.Errorf("got igmpHeader.GroupAddress() = %s, want = %s", got, want) + } + + igmpType := header.IGMPv2MembershipReport + igmpHeader.SetType(igmpType) + if got := igmpHeader.Type(); got != igmpType { + t.Errorf("got igmpHeader.Type() = %x, want = %x", got, igmpType) + } + if got := header.IGMPType(b[0]); got != igmpType { + t.Errorf("got IGMPtype in backing buffer = %x, want %x", got, igmpType) + } + + respTime := byte(0x02) + igmpHeader.SetMaxRespTime(respTime) + if got, want := igmpHeader.MaxRespTime(), header.DecisecondToDuration(respTime); got != want { + t.Errorf("got igmpHeader.MaxRespTime() = %s, want = %s", got, want) + } + + checksum := uint16(0x0102) + igmpHeader.SetChecksum(checksum) + if got := igmpHeader.Checksum(); got != checksum { + t.Errorf("got igmpHeader.Checksum() = %x, want = %x", got, checksum) + } + + groupAddress := tcpip.Address("\x04\x03\x02\x01") + igmpHeader.SetGroupAddress(groupAddress) + if got := igmpHeader.GroupAddress(); got != groupAddress { + t.Errorf("got igmpHeader.GroupAddress() = %s, want = %s", got, groupAddress) + } +} + +// TestIGMPChecksum ensures that the checksum calculator produces the expected +// checksum. +func TestIGMPChecksum(t *testing.T) { + b := []byte{ + 0x11, // IGMP Type, Membership Query + 0xF0, // Maximum Response Time + 0xC0, 0xC0, // Checksum + 0x01, 0x02, 0x03, 0x04, // Group Address + } + + igmpHeader := header.IGMP(b) + + // Calculate the initial checksum after setting the checksum temporarily to 0 + // to avoid checksumming the checksum. + initialChecksum := igmpHeader.Checksum() + igmpHeader.SetChecksum(0) + checksum := ^header.Checksum(b, 0) + igmpHeader.SetChecksum(initialChecksum) + + if got := header.IGMPCalculateChecksum(igmpHeader); got != checksum { + t.Errorf("got IGMPCalculateChecksum = %x, want %x", got, checksum) + } +} + +func TestDecisecondToDuration(t *testing.T) { + const valueInDeciseconds = 5 + if got, want := header.DecisecondToDuration(valueInDeciseconds), valueInDeciseconds*time.Second/10; got != want { + t.Fatalf("got header.DecisecondToDuration(%d) = %s, want = %s", valueInDeciseconds, got, want) + } +} diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index 91fe7b6a5..e6103f4bc 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -100,7 +100,7 @@ type IPv4Fields struct { // // That leaves ten 32 bit (4 byte) fields for options. An attempt to encode // more will fail. - Options IPv4Options + Options IPv4OptionsSerializer } // IPv4 is an IPv4 header. @@ -157,6 +157,9 @@ const ( // IPv4Any is the non-routable IPv4 "any" meta address. IPv4Any tcpip.Address = "\x00\x00\x00\x00" + // IPv4AllRoutersGroup is a multicast address for all routers. + IPv4AllRoutersGroup tcpip.Address = "\xe0\x00\x00\x02" + // IPv4MinimumProcessableDatagramSize is the minimum size of an IP // packet that every IPv4 capable host must be able to // process/reassemble. @@ -282,18 +285,17 @@ func (b IPv4) DestinationAddress() tcpip.Address { return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize]) } -// IPv4Options is a buffer that holds all the raw IP options. -type IPv4Options []byte - -// SizeWithPadding implements stack.NetOptions. -// It reports the size to allocate for the Options. RFC 791 page 23 (end of -// section 3.1) says of the padding at the end of the options: +// padIPv4OptionsLength returns the total length for IPv4 options of length l +// after applying padding according to RFC 791: // The internet header padding is used to ensure that the internet // header ends on a 32 bit boundary. -func (o IPv4Options) SizeWithPadding() int { - return (len(o) + IPv4IHLStride - 1) & ^(IPv4IHLStride - 1) +func padIPv4OptionsLength(length uint8) uint8 { + return (length + IPv4IHLStride - 1) & ^uint8(IPv4IHLStride-1) } +// IPv4Options is a buffer that holds all the raw IP options. +type IPv4Options []byte + // Options returns a buffer holding the options. func (b IPv4) Options() IPv4Options { hdrLen := b.HeaderLength() @@ -372,26 +374,16 @@ func (b IPv4) CalculateChecksum() uint16 { func (b IPv4) Encode(i *IPv4Fields) { // The size of the options defines the size of the whole header and thus the // IHL field. Options are rare and this is a heavily used function so it is - // worth a bit of optimisation here to keep the copy out of the fast path. - hdrLen := IPv4MinimumSize + // worth a bit of optimisation here to keep the serializer out of the fast + // path. + hdrLen := uint8(IPv4MinimumSize) if len(i.Options) != 0 { - // SizeWithPadding is always >= len(i.Options). - aLen := i.Options.SizeWithPadding() - hdrLen += aLen - if hdrLen > len(b) { - panic(fmt.Sprintf("encode received %d bytes, wanted >= %d", len(b), hdrLen)) - } - opts := b[options:] - // This avoids bounds checks on the next line(s) which would happen even - // if there's no work to do. - if n := copy(opts, i.Options); n != aLen { - padding := opts[n:][:aLen-n] - for i := range padding { - padding[i] = 0 - } - } + hdrLen += i.Options.Serialize(b[options:]) + } + if hdrLen > IPv4MaximumHeaderSize { + panic(fmt.Sprintf("%d is larger than maximum IPv4 header size of %d", hdrLen, IPv4MaximumHeaderSize)) } - b.SetHeaderLength(uint8(hdrLen)) + b.SetHeaderLength(hdrLen) b[tos] = i.TOS b.SetTotalLength(i.TotalLength) binary.BigEndian.PutUint16(b[id:], i.ID) @@ -471,6 +463,10 @@ const ( // options and may appear multiple times. IPv4OptionNOPType IPv4OptionType = 1 + // IPv4OptionRouterAlertType is the option type for the Router Alert option, + // defined in RFC 2113 Section 2.1. + IPv4OptionRouterAlertType IPv4OptionType = 20 | 0x80 + // IPv4OptionRecordRouteType is used by each router on the path of the packet // to record its path. It is carried over to an Echo Reply. IPv4OptionRecordRouteType IPv4OptionType = 7 @@ -871,3 +867,162 @@ func (rr *IPv4OptionRecordRoute) Size() uint8 { return uint8(len(*rr)) } // Contents implements IPv4Option. func (rr *IPv4OptionRecordRoute) Contents() []byte { return []byte(*rr) } + +// Router Alert option specific related constants. +// +// from RFC 2113 section 2.1: +// +// +--------+--------+--------+--------+ +// |10010100|00000100| 2 octet value | +// +--------+--------+--------+--------+ +// +// Type: +// Copied flag: 1 (all fragments must carry the option) +// Option class: 0 (control) +// Option number: 20 (decimal) +// +// Length: 4 +// +// Value: A two octet code with the following values: +// 0 - Router shall examine packet +// 1-65535 - Reserved +const ( + // IPv4OptionRouterAlertLength is the length of a Router Alert option. + IPv4OptionRouterAlertLength = 4 + + // IPv4OptionRouterAlertValue is the only permissible value of the 16 bit + // payload of the router alert option. + IPv4OptionRouterAlertValue = 0 + + // iPv4OptionRouterAlertValueOffset is the offset for the value of a + // RouterAlert option. + iPv4OptionRouterAlertValueOffset = 2 +) + +// IPv4SerializableOption is an interface to represent serializable IPv4 option +// types. +type IPv4SerializableOption interface { + // optionType returns the type identifier of the option. + optionType() IPv4OptionType +} + +// IPv4SerializableOptionPayload is an interface providing serialization of the +// payload of an IPv4 option. +type IPv4SerializableOptionPayload interface { + // length returns the size of the payload. + length() uint8 + + // serializeInto serializes the payload into the provided byte buffer. + // + // Note, the caller MUST provide a byte buffer with size of at least + // Length. Implementers of this function may assume that the byte buffer + // is of sufficient size. serializeInto MUST panic if the provided byte + // buffer is not of sufficient size. + // + // serializeInto will return the number of bytes that was used to + // serialize the receiver. Implementers must only use the number of + // bytes required to serialize the receiver. Callers MAY provide a + // larger buffer than required to serialize into. + serializeInto(buffer []byte) uint8 +} + +// IPv4OptionsSerializer is a serializer for IPv4 options. +type IPv4OptionsSerializer []IPv4SerializableOption + +// Length returns the total number of bytes required to serialize the options. +func (s IPv4OptionsSerializer) Length() uint8 { + var total uint8 + for _, opt := range s { + total++ + if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok { + // Add 1 to reported length to account for the length byte. + total += 1 + withPayload.length() + } + } + return padIPv4OptionsLength(total) +} + +// Serialize serializes the provided list of IPV4 options into b. +// +// Note, b must be of sufficient size to hold all the options in s. See +// IPv4OptionsSerializer.Length for details on the getting the total size +// of a serialized IPv4OptionsSerializer. +// +// Serialize panics if b is not of sufficient size to hold all the options in s. +func (s IPv4OptionsSerializer) Serialize(b []byte) uint8 { + var total uint8 + for _, opt := range s { + ty := opt.optionType() + if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok { + // Serialize first to reduce bounds checks. + l := 2 + withPayload.serializeInto(b[2:]) + b[0] = byte(ty) + b[1] = l + b = b[l:] + total += l + continue + } + // Options without payload consist only of the type field. + // + // NB: Repeating code from the branch above is intentional to minimize + // bounds checks. + b[0] = byte(ty) + b = b[1:] + total++ + } + + // According to RFC 791: + // + // The internet header padding is used to ensure that the internet + // header ends on a 32 bit boundary. The padding is zero. + padded := padIPv4OptionsLength(total) + b = b[:padded-total] + for i := range b { + b[i] = 0 + } + return padded +} + +var _ IPv4SerializableOptionPayload = (*IPv4SerializableRouterAlertOption)(nil) +var _ IPv4SerializableOption = (*IPv4SerializableRouterAlertOption)(nil) + +// IPv4SerializableRouterAlertOption provides serialization of the Router Alert +// IPv4 option according to RFC 2113. +type IPv4SerializableRouterAlertOption struct{} + +// Type implements IPv4SerializableOption. +func (*IPv4SerializableRouterAlertOption) optionType() IPv4OptionType { + return IPv4OptionRouterAlertType +} + +// Length implements IPv4SerializableOption. +func (*IPv4SerializableRouterAlertOption) length() uint8 { + return IPv4OptionRouterAlertLength - iPv4OptionRouterAlertValueOffset +} + +// SerializeInto implements IPv4SerializableOption. +func (o *IPv4SerializableRouterAlertOption) serializeInto(buffer []byte) uint8 { + binary.BigEndian.PutUint16(buffer, IPv4OptionRouterAlertValue) + return o.length() +} + +var _ IPv4SerializableOption = (*IPv4SerializableNOPOption)(nil) + +// IPv4SerializableNOPOption provides serialization for the IPv4 no-op option. +type IPv4SerializableNOPOption struct{} + +// Type implements IPv4SerializableOption. +func (*IPv4SerializableNOPOption) optionType() IPv4OptionType { + return IPv4OptionNOPType +} + +var _ IPv4SerializableOption = (*IPv4SerializableListEndOption)(nil) + +// IPv4SerializableListEndOption provides serialization for the IPv4 List End +// option. +type IPv4SerializableListEndOption struct{} + +// Type implements IPv4SerializableOption. +func (*IPv4SerializableListEndOption) optionType() IPv4OptionType { + return IPv4OptionListEndType +} diff --git a/pkg/tcpip/header/ipv4_test.go b/pkg/tcpip/header/ipv4_test.go new file mode 100644 index 000000000..6475cd694 --- /dev/null +++ b/pkg/tcpip/header/ipv4_test.go @@ -0,0 +1,179 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header_test + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +func TestIPv4OptionsSerializer(t *testing.T) { + optCases := []struct { + name string + option []header.IPv4SerializableOption + expect []byte + }{ + { + name: "NOP", + option: []header.IPv4SerializableOption{ + &header.IPv4SerializableNOPOption{}, + }, + expect: []byte{1, 0, 0, 0}, + }, + { + name: "ListEnd", + option: []header.IPv4SerializableOption{ + &header.IPv4SerializableListEndOption{}, + }, + expect: []byte{0, 0, 0, 0}, + }, + { + name: "RouterAlert", + option: []header.IPv4SerializableOption{ + &header.IPv4SerializableRouterAlertOption{}, + }, + expect: []byte{148, 4, 0, 0}, + }, { + name: "NOP and RouterAlert", + option: []header.IPv4SerializableOption{ + &header.IPv4SerializableNOPOption{}, + &header.IPv4SerializableRouterAlertOption{}, + }, + expect: []byte{1, 148, 4, 0, 0, 0, 0, 0}, + }, + } + + for _, opt := range optCases { + t.Run(opt.name, func(t *testing.T) { + s := header.IPv4OptionsSerializer(opt.option) + l := s.Length() + if got := len(opt.expect); got != int(l) { + t.Fatalf("s.Length() = %d, want = %d", got, l) + } + b := make([]byte, l) + for i := range b { + // Fill the buffer with full bytes to ensure padding is being set + // correctly. + b[i] = 0xFF + } + if serializedLength := s.Serialize(b); serializedLength != l { + t.Fatalf("s.Serialize(_) = %d, want %d", serializedLength, l) + } + if diff := cmp.Diff(opt.expect, b); diff != "" { + t.Errorf("mismatched serialized option (-want +got):\n%s", diff) + } + }) + } +} + +// TestIPv4Encode checks that ipv4.Encode correctly fills out the requested +// fields when options are supplied. +func TestIPv4EncodeOptions(t *testing.T) { + tests := []struct { + name string + numberOfNops int + encodedOptions header.IPv4Options // reply should look like this + wantIHL int + }{ + { + name: "valid no options", + wantIHL: header.IPv4MinimumSize, + }, + { + name: "one byte options", + numberOfNops: 1, + encodedOptions: header.IPv4Options{1, 0, 0, 0}, + wantIHL: header.IPv4MinimumSize + 4, + }, + { + name: "two byte options", + numberOfNops: 2, + encodedOptions: header.IPv4Options{1, 1, 0, 0}, + wantIHL: header.IPv4MinimumSize + 4, + }, + { + name: "three byte options", + numberOfNops: 3, + encodedOptions: header.IPv4Options{1, 1, 1, 0}, + wantIHL: header.IPv4MinimumSize + 4, + }, + { + name: "four byte options", + numberOfNops: 4, + encodedOptions: header.IPv4Options{1, 1, 1, 1}, + wantIHL: header.IPv4MinimumSize + 4, + }, + { + name: "five byte options", + numberOfNops: 5, + encodedOptions: header.IPv4Options{1, 1, 1, 1, 1, 0, 0, 0}, + wantIHL: header.IPv4MinimumSize + 8, + }, + { + name: "thirty nine byte options", + numberOfNops: 39, + encodedOptions: header.IPv4Options{ + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, + }, + wantIHL: header.IPv4MinimumSize + 40, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + serializeOpts := header.IPv4OptionsSerializer(make([]header.IPv4SerializableOption, test.numberOfNops)) + for i := range serializeOpts { + serializeOpts[i] = &header.IPv4SerializableNOPOption{} + } + paddedOptionLength := serializeOpts.Length() + ipHeaderLength := int(header.IPv4MinimumSize + paddedOptionLength) + if ipHeaderLength > header.IPv4MaximumHeaderSize { + t.Fatalf("IP header length too large: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize) + } + totalLen := uint16(ipHeaderLength) + hdr := buffer.NewPrependable(int(totalLen)) + ip := header.IPv4(hdr.Prepend(ipHeaderLength)) + // To check the padding works, poison the last byte of the options space. + if paddedOptionLength != serializeOpts.Length() { + ip.SetHeaderLength(uint8(ipHeaderLength)) + ip.Options()[paddedOptionLength-1] = 0xff + ip.SetHeaderLength(0) + } + ip.Encode(&header.IPv4Fields{ + Options: serializeOpts, + }) + options := ip.Options() + wantOptions := test.encodedOptions + if got, want := int(ip.HeaderLength()), test.wantIHL; got != want { + t.Errorf("got IHL of %d, want %d", got, want) + } + + // cmp.Diff does not consider nil slices equal to empty slices, but we do. + if len(wantOptions) == 0 && len(options) == 0 { + return + } + + if diff := cmp.Diff(wantOptions, options); diff != "" { + t.Errorf("options mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 55d09355a..d522e5f10 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -48,11 +48,13 @@ type IPv6Fields struct { // FlowLabel is the "flow label" field of an IPv6 packet. FlowLabel uint32 - // PayloadLength is the "payload length" field of an IPv6 packet. + // PayloadLength is the "payload length" field of an IPv6 packet, including + // the length of all extension headers. PayloadLength uint16 - // NextHeader is the "next header" field of an IPv6 packet. - NextHeader uint8 + // TransportProtocol is the transport layer protocol number. Serialized in the + // last "next header" field of the IPv6 header + extension headers. + TransportProtocol tcpip.TransportProtocolNumber // HopLimit is the "Hop Limit" field of an IPv6 packet. HopLimit uint8 @@ -62,6 +64,9 @@ type IPv6Fields struct { // DstAddr is the "destination ip address" of an IPv6 packet. DstAddr tcpip.Address + + // ExtensionHeaders are the extension headers following the IPv6 header. + ExtensionHeaders IPv6ExtHdrSerializer } // IPv6 represents an ipv6 header stored in a byte array. @@ -253,12 +258,14 @@ func (IPv6) SetChecksum(uint16) { // Encode encodes all the fields of the ipv6 header. func (b IPv6) Encode(i *IPv6Fields) { + extHdr := b[IPv6MinimumSize:] b.SetTOS(i.TrafficClass, i.FlowLabel) b.SetPayloadLength(i.PayloadLength) - b[IPv6NextHeaderOffset] = i.NextHeader b[hopLimit] = i.HopLimit b.SetSourceAddress(i.SrcAddr) b.SetDestinationAddress(i.DstAddr) + nextHeader, _ := i.ExtensionHeaders.Serialize(i.TransportProtocol, extHdr) + b[IPv6NextHeaderOffset] = nextHeader } // IsValid performs basic validation on the packet. diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go index 583c2c5d3..f18981332 100644 --- a/pkg/tcpip/header/ipv6_extension_headers.go +++ b/pkg/tcpip/header/ipv6_extension_headers.go @@ -18,9 +18,12 @@ import ( "bufio" "bytes" "encoding/binary" + "errors" "fmt" "io" + "math" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) @@ -47,6 +50,11 @@ const ( // IPv6NoNextHeaderIdentifier is the header identifier used to signify the end // of an IPv6 payload, as per RFC 8200 section 4.7. IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59 + + // IPv6UnknownExtHdrIdentifier is reserved by IANA. + // https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#extension-header + // "254 Use for experimentation and testing [RFC3692][RFC4727]" + IPv6UnknownExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 254 ) const ( @@ -70,8 +78,8 @@ const ( // Fragment Offset field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrFragmentOffsetOffset = 0 - // ipv6FragmentExtHdrFragmentOffsetShift is the least significant bits to - // discard from the Fragment Offset. + // ipv6FragmentExtHdrFragmentOffsetShift is the bit offset of the Fragment + // Offset field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrFragmentOffsetShift = 3 // ipv6FragmentExtHdrFlagsIdx is the index to the flags field within an @@ -109,6 +117,37 @@ const ( IPv6FragmentExtHdrFragmentOffsetBytesPerUnit = 8 ) +// padIPv6OptionsLength returns the total length for IPv6 options of length l +// considering the 8-octet alignment as stated in RFC 8200 Section 4.2. +func padIPv6OptionsLength(length int) int { + return (length + ipv6ExtHdrLenBytesPerUnit - 1) & ^(ipv6ExtHdrLenBytesPerUnit - 1) +} + +// padIPv6Option fills b with the appropriate padding options depending on its +// length. +func padIPv6Option(b []byte) { + switch len(b) { + case 0: // No padding needed. + case 1: // Pad with Pad1. + b[ipv6ExtHdrOptionTypeOffset] = uint8(ipv6Pad1ExtHdrOptionIdentifier) + default: // Pad with PadN. + s := b[ipv6ExtHdrOptionPayloadOffset:] + for i := range s { + s[i] = 0 + } + b[ipv6ExtHdrOptionTypeOffset] = uint8(ipv6PadNExtHdrOptionIdentifier) + b[ipv6ExtHdrOptionLengthOffset] = uint8(len(s)) + } +} + +// ipv6OptionsAlignmentPadding returns the number of padding bytes needed to +// serialize an option at headerOffset with alignment requirements +// [align]n + alignOffset. +func ipv6OptionsAlignmentPadding(headerOffset int, align int, alignOffset int) int { + padLen := headerOffset - alignOffset + return ((padLen + align - 1) & ^(align - 1)) - padLen +} + // IPv6PayloadHeader is implemented by the various headers that can be found // in an IPv6 payload. // @@ -201,29 +240,55 @@ type IPv6ExtHdrOption interface { isIPv6ExtHdrOption() } -// IPv6ExtHdrOptionIndentifier is an IPv6 extension header option identifier. -type IPv6ExtHdrOptionIndentifier uint8 +// IPv6ExtHdrOptionIdentifier is an IPv6 extension header option identifier. +type IPv6ExtHdrOptionIdentifier uint8 const ( // ipv6Pad1ExtHdrOptionIdentifier is the identifier for a padding option that // provides 1 byte padding, as outlined in RFC 8200 section 4.2. - ipv6Pad1ExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 0 + ipv6Pad1ExtHdrOptionIdentifier IPv6ExtHdrOptionIdentifier = 0 // ipv6PadBExtHdrOptionIdentifier is the identifier for a padding option that // provides variable length byte padding, as outlined in RFC 8200 section 4.2. - ipv6PadNExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 1 + ipv6PadNExtHdrOptionIdentifier IPv6ExtHdrOptionIdentifier = 1 + + // ipv6RouterAlertHopByHopOptionIdentifier is the identifier for the Router + // Alert Hop by Hop option as defined in RFC 2711 section 2.1. + ipv6RouterAlertHopByHopOptionIdentifier IPv6ExtHdrOptionIdentifier = 5 + + // ipv6ExtHdrOptionTypeOffset is the option type offset in an extension header + // option as defined in RFC 8200 section 4.2. + ipv6ExtHdrOptionTypeOffset = 0 + + // ipv6ExtHdrOptionLengthOffset is the option length offset in an extension + // header option as defined in RFC 8200 section 4.2. + ipv6ExtHdrOptionLengthOffset = 1 + + // ipv6ExtHdrOptionPayloadOffset is the option payload offset in an extension + // header option as defined in RFC 8200 section 4.2. + ipv6ExtHdrOptionPayloadOffset = 2 ) +// ipv6UnknownActionFromIdentifier maps an extension header option's +// identifier's high bits to the action to take when the identifier is unknown. +func ipv6UnknownActionFromIdentifier(id IPv6ExtHdrOptionIdentifier) IPv6OptionUnknownAction { + return IPv6OptionUnknownAction((id & ipv6UnknownExtHdrOptionActionMask) >> ipv6UnknownExtHdrOptionActionShift) +} + +// ErrMalformedIPv6ExtHdrOption indicates that an IPv6 extension header option +// is malformed. +var ErrMalformedIPv6ExtHdrOption = errors.New("malformed IPv6 extension header option") + // IPv6UnknownExtHdrOption holds the identifier and data for an IPv6 extension // header option that is unknown by the parsing utilities. type IPv6UnknownExtHdrOption struct { - Identifier IPv6ExtHdrOptionIndentifier + Identifier IPv6ExtHdrOptionIdentifier Data []byte } // UnknownAction implements IPv6OptionUnknownAction.UnknownAction. func (o *IPv6UnknownExtHdrOption) UnknownAction() IPv6OptionUnknownAction { - return IPv6OptionUnknownAction((o.Identifier & ipv6UnknownExtHdrOptionActionMask) >> ipv6UnknownExtHdrOptionActionShift) + return ipv6UnknownActionFromIdentifier(o.Identifier) } // isIPv6ExtHdrOption implements IPv6ExtHdrOption.isIPv6ExtHdrOption. @@ -246,7 +311,7 @@ func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error // options buffer has been exhausted and we are done iterating. return nil, true, nil } - id := IPv6ExtHdrOptionIndentifier(temp) + id := IPv6ExtHdrOptionIdentifier(temp) // If the option identifier indicates the option is a Pad1 option, then we // know the option does not have Length and Data fields. End processing of @@ -289,6 +354,19 @@ func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error panic(fmt.Sprintf("error when skipping PadN (N = %d) option's data bytes: %s", length, err)) } continue + case ipv6RouterAlertHopByHopOptionIdentifier: + var routerAlertValue [ipv6RouterAlertPayloadLength]byte + if n, err := io.ReadFull(&i.reader, routerAlertValue[:]); err != nil { + switch err { + case io.EOF, io.ErrUnexpectedEOF: + return nil, true, fmt.Errorf("got invalid length (%d) for router alert option (want = %d): %w", length, ipv6RouterAlertPayloadLength, ErrMalformedIPv6ExtHdrOption) + default: + return nil, true, fmt.Errorf("read %d out of %d option data bytes for router alert option: %w", n, ipv6RouterAlertPayloadLength, err) + } + } else if n != int(length) { + return nil, true, fmt.Errorf("got invalid length (%d) for router alert option (want = %d): %w", length, ipv6RouterAlertPayloadLength, ErrMalformedIPv6ExtHdrOption) + } + return &IPv6RouterAlertOption{Value: IPv6RouterAlertValue(binary.BigEndian.Uint16(routerAlertValue[:]))}, false, nil default: bytes := make([]byte, length) if n, err := io.ReadFull(&i.reader, bytes); err != nil { @@ -452,9 +530,11 @@ func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader { // Since we consume the iterator, we return the payload as is. buf = i.payload - // Mark i as done. + // Mark i as done, but keep track of where we were for error reporting. *i = IPv6PayloadIterator{ nextHdrIdentifier: IPv6NoNextHeaderIdentifier, + headerOffset: i.headerOffset, + nextOffset: i.nextOffset, } } else { buf = i.payload.Clone(nil) @@ -602,3 +682,248 @@ func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IP return IPv6ExtensionHeaderIdentifier(nextHdrIdentifier), bytes, nil } + +// IPv6SerializableExtHdr provides serialization for IPv6 extension +// headers. +type IPv6SerializableExtHdr interface { + // identifier returns the assigned IPv6 header identifier for this extension + // header. + identifier() IPv6ExtensionHeaderIdentifier + + // length returns the total serialized length in bytes of this extension + // header, including the common next header and length fields. + length() int + + // serializeInto serializes the receiver into the provided byte + // buffer and with the provided nextHeader value. + // + // Note, the caller MUST provide a byte buffer with size of at least + // length. Implementers of this function may assume that the byte buffer + // is of sufficient size. serializeInto MAY panic if the provided byte + // buffer is not of sufficient size. + // + // serializeInto returns the number of bytes that was used to serialize the + // receiver. Implementers must only use the number of bytes required to + // serialize the receiver. Callers MAY provide a larger buffer than required + // to serialize into. + serializeInto(nextHeader uint8, b []byte) int +} + +var _ IPv6SerializableExtHdr = (*IPv6SerializableHopByHopExtHdr)(nil) + +// IPv6SerializableHopByHopExtHdr implements serialization of the Hop by Hop +// options extension header. +type IPv6SerializableHopByHopExtHdr []IPv6SerializableHopByHopOption + +const ( + // ipv6HopByHopExtHdrNextHeaderOffset is the offset of the next header field + // in a hop by hop extension header as defined in RFC 8200 section 4.3. + ipv6HopByHopExtHdrNextHeaderOffset = 0 + + // ipv6HopByHopExtHdrLengthOffset is the offset of the length field in a hop + // by hop extension header as defined in RFC 8200 section 4.3. + ipv6HopByHopExtHdrLengthOffset = 1 + + // ipv6HopByHopExtHdrPayloadOffset is the offset of the options in a hop by + // hop extension header as defined in RFC 8200 section 4.3. + ipv6HopByHopExtHdrOptionsOffset = 2 + + // ipv6HopByHopExtHdrUnaccountedLenWords is the implicit number of 8-octet + // words in a hop by hop extension header's length field, as stated in RFC + // 8200 section 4.3: + // Length of the Hop-by-Hop Options header in 8-octet units, + // not including the first 8 octets. + ipv6HopByHopExtHdrUnaccountedLenWords = 1 +) + +// identifier implements IPv6SerializableExtHdr. +func (IPv6SerializableHopByHopExtHdr) identifier() IPv6ExtensionHeaderIdentifier { + return IPv6HopByHopOptionsExtHdrIdentifier +} + +// length implements IPv6SerializableExtHdr. +func (h IPv6SerializableHopByHopExtHdr) length() int { + var total int + for _, opt := range h { + align, alignOffset := opt.alignment() + total += ipv6OptionsAlignmentPadding(total, align, alignOffset) + total += ipv6ExtHdrOptionPayloadOffset + int(opt.length()) + } + // Account for next header and total length fields and add padding. + return padIPv6OptionsLength(ipv6HopByHopExtHdrOptionsOffset + total) +} + +// serializeInto implements IPv6SerializableExtHdr. +func (h IPv6SerializableHopByHopExtHdr) serializeInto(nextHeader uint8, b []byte) int { + optBuffer := b[ipv6HopByHopExtHdrOptionsOffset:] + totalLength := ipv6HopByHopExtHdrOptionsOffset + for _, opt := range h { + // Calculate alignment requirements and pad buffer if necessary. + align, alignOffset := opt.alignment() + padLen := ipv6OptionsAlignmentPadding(totalLength, align, alignOffset) + if padLen != 0 { + padIPv6Option(optBuffer[:padLen]) + totalLength += padLen + optBuffer = optBuffer[padLen:] + } + + l := opt.serializeInto(optBuffer[ipv6ExtHdrOptionPayloadOffset:]) + optBuffer[ipv6ExtHdrOptionTypeOffset] = uint8(opt.identifier()) + optBuffer[ipv6ExtHdrOptionLengthOffset] = l + l += ipv6ExtHdrOptionPayloadOffset + totalLength += int(l) + optBuffer = optBuffer[l:] + } + padded := padIPv6OptionsLength(totalLength) + if padded != totalLength { + padIPv6Option(optBuffer[:padded-totalLength]) + totalLength = padded + } + wordsLen := totalLength/ipv6ExtHdrLenBytesPerUnit - ipv6HopByHopExtHdrUnaccountedLenWords + if wordsLen > math.MaxUint8 { + panic(fmt.Sprintf("IPv6 hop by hop options too large: %d+1 64-bit words", wordsLen)) + } + b[ipv6HopByHopExtHdrNextHeaderOffset] = nextHeader + b[ipv6HopByHopExtHdrLengthOffset] = uint8(wordsLen) + return totalLength +} + +// IPv6SerializableHopByHopOption provides serialization for hop by hop options. +type IPv6SerializableHopByHopOption interface { + // identifier returns the option identifier of this Hop by Hop option. + identifier() IPv6ExtHdrOptionIdentifier + + // length returns the *payload* size of the option (not considering the type + // and length fields). + length() uint8 + + // alignment returns the alignment requirements from this option. + // + // Alignment requirements take the form [align]n + offset as specified in + // RFC 8200 section 4.2. The alignment requirement is on the offset between + // the option type byte and the start of the hop by hop header. + // + // align must be a power of 2. + alignment() (align int, offset int) + + // serializeInto serializes the receiver into the provided byte + // buffer. + // + // Note, the caller MUST provide a byte buffer with size of at least + // length. Implementers of this function may assume that the byte buffer + // is of sufficient size. serializeInto MAY panic if the provided byte + // buffer is not of sufficient size. + // + // serializeInto will return the number of bytes that was used to + // serialize the receiver. Implementers must only use the number of + // bytes required to serialize the receiver. Callers MAY provide a + // larger buffer than required to serialize into. + serializeInto([]byte) uint8 +} + +var _ IPv6SerializableHopByHopOption = (*IPv6RouterAlertOption)(nil) + +// IPv6RouterAlertOption is the IPv6 Router alert Hop by Hop option defined in +// RFC 2711 section 2.1. +type IPv6RouterAlertOption struct { + Value IPv6RouterAlertValue +} + +// IPv6RouterAlertValue is the payload of an IPv6 Router Alert option. +type IPv6RouterAlertValue uint16 + +const ( + // IPv6RouterAlertMLD indicates a datagram containing a Multicast Listener + // Discovery message as defined in RFC 2711 section 2.1. + IPv6RouterAlertMLD IPv6RouterAlertValue = 0 + // IPv6RouterAlertRSVP indicates a datagram containing an RSVP message as + // defined in RFC 2711 section 2.1. + IPv6RouterAlertRSVP IPv6RouterAlertValue = 1 + // IPv6RouterAlertActiveNetworks indicates a datagram containing an Active + // Networks message as defined in RFC 2711 section 2.1. + IPv6RouterAlertActiveNetworks IPv6RouterAlertValue = 2 + + // ipv6RouterAlertPayloadLength is the length of the Router Alert payload + // as defined in RFC 2711. + ipv6RouterAlertPayloadLength = 2 + + // ipv6RouterAlertAlignmentRequirement is the alignment requirement for the + // Router Alert option defined as 2n+0 in RFC 2711. + ipv6RouterAlertAlignmentRequirement = 2 + + // ipv6RouterAlertAlignmentOffsetRequirement is the alignment offset + // requirement for the Router Alert option defined as 2n+0 in RFC 2711 section + // 2.1. + ipv6RouterAlertAlignmentOffsetRequirement = 0 +) + +// UnknownAction implements IPv6ExtHdrOption. +func (*IPv6RouterAlertOption) UnknownAction() IPv6OptionUnknownAction { + return ipv6UnknownActionFromIdentifier(ipv6RouterAlertHopByHopOptionIdentifier) +} + +// isIPv6ExtHdrOption implements IPv6ExtHdrOption. +func (*IPv6RouterAlertOption) isIPv6ExtHdrOption() {} + +// identifier implements IPv6SerializableHopByHopOption. +func (*IPv6RouterAlertOption) identifier() IPv6ExtHdrOptionIdentifier { + return ipv6RouterAlertHopByHopOptionIdentifier +} + +// length implements IPv6SerializableHopByHopOption. +func (*IPv6RouterAlertOption) length() uint8 { + return ipv6RouterAlertPayloadLength +} + +// alignment implements IPv6SerializableHopByHopOption. +func (*IPv6RouterAlertOption) alignment() (int, int) { + // From RFC 2711 section 2.1: + // Alignment requirement: 2n+0. + return ipv6RouterAlertAlignmentRequirement, ipv6RouterAlertAlignmentOffsetRequirement +} + +// serializeInto implements IPv6SerializableHopByHopOption. +func (o *IPv6RouterAlertOption) serializeInto(b []byte) uint8 { + binary.BigEndian.PutUint16(b, uint16(o.Value)) + return ipv6RouterAlertPayloadLength +} + +// IPv6ExtHdrSerializer provides serialization of IPv6 extension headers. +type IPv6ExtHdrSerializer []IPv6SerializableExtHdr + +// Serialize serializes the provided list of IPv6 extension headers into b. +// +// Note, b must be of sufficient size to hold all the headers in s. See +// IPv6ExtHdrSerializer.Length for details on the getting the total size of a +// serialized IPv6ExtHdrSerializer. +// +// Serialize may panic if b is not of sufficient size to hold all the options +// in s. +// +// Serialize takes the transportProtocol value to be used as the last extension +// header's Next Header value and returns the header identifier of the first +// serialized extension header and the total serialized length. +func (s IPv6ExtHdrSerializer) Serialize(transportProtocol tcpip.TransportProtocolNumber, b []byte) (uint8, int) { + nextHeader := uint8(transportProtocol) + if len(s) == 0 { + return nextHeader, 0 + } + var totalLength int + for i, h := range s[:len(s)-1] { + length := h.serializeInto(uint8(s[i+1].identifier()), b) + b = b[length:] + totalLength += length + } + totalLength += s[len(s)-1].serializeInto(nextHeader, b) + return uint8(s[0].identifier()), totalLength +} + +// Length returns the total number of bytes required to serialize the extension +// headers. +func (s IPv6ExtHdrSerializer) Length() int { + var totalLength int + for _, h := range s { + totalLength += h.length() + } + return totalLength +} diff --git a/pkg/tcpip/header/ipv6_extension_headers_test.go b/pkg/tcpip/header/ipv6_extension_headers_test.go index ab20c5f37..65adc6250 100644 --- a/pkg/tcpip/header/ipv6_extension_headers_test.go +++ b/pkg/tcpip/header/ipv6_extension_headers_test.go @@ -21,6 +21,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) @@ -59,7 +60,7 @@ func (a IPv6DestinationOptionsExtHdr) Equal(b IPv6DestinationOptionsExtHdr) bool func TestIPv6UnknownExtHdrOption(t *testing.T) { tests := []struct { name string - identifier IPv6ExtHdrOptionIndentifier + identifier IPv6ExtHdrOptionIdentifier expectedUnknownAction IPv6OptionUnknownAction }{ { @@ -211,6 +212,31 @@ func TestIPv6OptionsExtHdrIterErr(t *testing.T) { bytes: []byte{1, 3}, err: io.ErrUnexpectedEOF, }, + { + name: "Router alert without data", + bytes: []byte{byte(ipv6RouterAlertHopByHopOptionIdentifier), 0}, + err: ErrMalformedIPv6ExtHdrOption, + }, + { + name: "Router alert with partial data", + bytes: []byte{byte(ipv6RouterAlertHopByHopOptionIdentifier), 1, 1}, + err: ErrMalformedIPv6ExtHdrOption, + }, + { + name: "Router alert with partial data and Pad1", + bytes: []byte{byte(ipv6RouterAlertHopByHopOptionIdentifier), 1, 1, 0}, + err: ErrMalformedIPv6ExtHdrOption, + }, + { + name: "Router alert with extra data", + bytes: []byte{byte(ipv6RouterAlertHopByHopOptionIdentifier), 3, 1, 2, 3}, + err: ErrMalformedIPv6ExtHdrOption, + }, + { + name: "Router alert with missing data", + bytes: []byte{byte(ipv6RouterAlertHopByHopOptionIdentifier), 1}, + err: io.ErrUnexpectedEOF, + }, } check := func(t *testing.T, it IPv6OptionsExtHdrOptionsIterator, expectedErr error) { @@ -990,3 +1016,331 @@ func TestIPv6ExtHdrIter(t *testing.T) { }) } } + +var _ IPv6SerializableHopByHopOption = (*dummyHbHOptionSerializer)(nil) + +// dummyHbHOptionSerializer provides a generic implementation of +// IPv6SerializableHopByHopOption for use in tests. +type dummyHbHOptionSerializer struct { + id IPv6ExtHdrOptionIdentifier + payload []byte + align int + alignOffset int +} + +// identifier implements IPv6SerializableHopByHopOption. +func (s *dummyHbHOptionSerializer) identifier() IPv6ExtHdrOptionIdentifier { + return s.id +} + +// length implements IPv6SerializableHopByHopOption. +func (s *dummyHbHOptionSerializer) length() uint8 { + return uint8(len(s.payload)) +} + +// alignment implements IPv6SerializableHopByHopOption. +func (s *dummyHbHOptionSerializer) alignment() (int, int) { + align := 1 + if s.align != 0 { + align = s.align + } + return align, s.alignOffset +} + +// serializeInto implements IPv6SerializableHopByHopOption. +func (s *dummyHbHOptionSerializer) serializeInto(b []byte) uint8 { + return uint8(copy(b, s.payload)) +} + +func TestIPv6HopByHopSerializer(t *testing.T) { + validateDummies := func(t *testing.T, serializable IPv6SerializableHopByHopOption, deserialized IPv6ExtHdrOption) { + t.Helper() + dummy, ok := serializable.(*dummyHbHOptionSerializer) + if !ok { + t.Fatalf("got serializable = %T, want = *dummyHbHOptionSerializer", serializable) + } + unknown, ok := deserialized.(*IPv6UnknownExtHdrOption) + if !ok { + t.Fatalf("got deserialized = %T, want = %T", deserialized, &IPv6UnknownExtHdrOption{}) + } + if dummy.id != unknown.Identifier { + t.Errorf("got deserialized identifier = %d, want = %d", unknown.Identifier, dummy.id) + } + if diff := cmp.Diff(dummy.payload, unknown.Data); diff != "" { + t.Errorf("option payload deserialization mismatch (-want +got):\n%s", diff) + } + } + tests := []struct { + name string + nextHeader uint8 + options []IPv6SerializableHopByHopOption + expect []byte + validate func(*testing.T, IPv6SerializableHopByHopOption, IPv6ExtHdrOption) + }{ + { + name: "single option", + nextHeader: 13, + options: []IPv6SerializableHopByHopOption{ + &dummyHbHOptionSerializer{ + id: 15, + payload: []byte{9, 8, 7, 6}, + }, + }, + expect: []byte{13, 0, 15, 4, 9, 8, 7, 6}, + validate: validateDummies, + }, + { + name: "short option padN zero", + nextHeader: 88, + options: []IPv6SerializableHopByHopOption{ + &dummyHbHOptionSerializer{ + id: 22, + payload: []byte{4, 5}, + }, + }, + expect: []byte{88, 0, 22, 2, 4, 5, 1, 0}, + validate: validateDummies, + }, + { + name: "short option pad1", + nextHeader: 11, + options: []IPv6SerializableHopByHopOption{ + &dummyHbHOptionSerializer{ + id: 33, + payload: []byte{1, 2, 3}, + }, + }, + expect: []byte{11, 0, 33, 3, 1, 2, 3, 0}, + validate: validateDummies, + }, + { + name: "long option padN", + nextHeader: 55, + options: []IPv6SerializableHopByHopOption{ + &dummyHbHOptionSerializer{ + id: 77, + payload: []byte{1, 2, 3, 4, 5, 6, 7, 8}, + }, + }, + expect: []byte{55, 1, 77, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 0, 0}, + validate: validateDummies, + }, + { + name: "two options", + nextHeader: 33, + options: []IPv6SerializableHopByHopOption{ + &dummyHbHOptionSerializer{ + id: 11, + payload: []byte{1, 2, 3}, + }, + &dummyHbHOptionSerializer{ + id: 22, + payload: []byte{4, 5, 6}, + }, + }, + expect: []byte{33, 1, 11, 3, 1, 2, 3, 22, 3, 4, 5, 6, 1, 2, 0, 0}, + validate: validateDummies, + }, + { + name: "two options align 2n", + nextHeader: 33, + options: []IPv6SerializableHopByHopOption{ + &dummyHbHOptionSerializer{ + id: 11, + payload: []byte{1, 2, 3}, + }, + &dummyHbHOptionSerializer{ + id: 22, + payload: []byte{4, 5, 6}, + align: 2, + }, + }, + expect: []byte{33, 1, 11, 3, 1, 2, 3, 0, 22, 3, 4, 5, 6, 1, 1, 0}, + validate: validateDummies, + }, + { + name: "two options align 8n+1", + nextHeader: 33, + options: []IPv6SerializableHopByHopOption{ + &dummyHbHOptionSerializer{ + id: 11, + payload: []byte{1, 2}, + }, + &dummyHbHOptionSerializer{ + id: 22, + payload: []byte{4, 5, 6}, + align: 8, + alignOffset: 1, + }, + }, + expect: []byte{33, 1, 11, 2, 1, 2, 1, 1, 0, 22, 3, 4, 5, 6, 1, 0}, + validate: validateDummies, + }, + { + name: "no options", + nextHeader: 33, + options: []IPv6SerializableHopByHopOption{}, + expect: []byte{33, 0, 1, 4, 0, 0, 0, 0}, + }, + { + name: "Router Alert", + nextHeader: 33, + options: []IPv6SerializableHopByHopOption{&IPv6RouterAlertOption{Value: IPv6RouterAlertMLD}}, + expect: []byte{33, 0, 5, 2, 0, 0, 1, 0}, + validate: func(t *testing.T, _ IPv6SerializableHopByHopOption, deserialized IPv6ExtHdrOption) { + t.Helper() + routerAlert, ok := deserialized.(*IPv6RouterAlertOption) + if !ok { + t.Fatalf("got deserialized = %T, want = *IPv6RouterAlertOption", deserialized) + } + if routerAlert.Value != IPv6RouterAlertMLD { + t.Errorf("got routerAlert.Value = %d, want = %d", routerAlert.Value, IPv6RouterAlertMLD) + } + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := IPv6SerializableHopByHopExtHdr(test.options) + length := s.length() + if length != len(test.expect) { + t.Fatalf("got s.length() = %d, want = %d", length, len(test.expect)) + } + b := make([]byte, length) + for i := range b { + // Fill the buffer with ones to ensure all padding is correctly set. + b[i] = 0xFF + } + if got := s.serializeInto(test.nextHeader, b); got != length { + t.Fatalf("got s.serializeInto(..) = %d, want = %d", got, length) + } + if diff := cmp.Diff(test.expect, b); diff != "" { + t.Fatalf("serialization mismatch (-want +got):\n%s", diff) + } + + // Deserialize the options and verify them. + optLen := (b[ipv6HopByHopExtHdrLengthOffset] + ipv6HopByHopExtHdrUnaccountedLenWords) * ipv6ExtHdrLenBytesPerUnit + iter := ipv6OptionsExtHdr(b[ipv6HopByHopExtHdrOptionsOffset:optLen]).Iter() + for _, testOpt := range test.options { + opt, done, err := iter.Next() + if err != nil { + t.Fatalf("iter.Next(): %s", err) + } + if done { + t.Fatalf("got iter.Next() = (%T, %t, _), want = (_, false, _)", opt, done) + } + test.validate(t, testOpt, opt) + } + opt, done, err := iter.Next() + if err != nil { + t.Fatalf("iter.Next(): %s", err) + } + if !done { + t.Fatalf("got iter.Next() = (%T, %t, _), want = (_, true, _)", opt, done) + } + }) + } +} + +var _ IPv6SerializableExtHdr = (*dummyIPv6ExtHdrSerializer)(nil) + +// dummyIPv6ExtHdrSerializer provides a generic implementation of +// IPv6SerializableExtHdr for use in tests. +// +// The dummy header always carries the nextHeader value in the first byte. +type dummyIPv6ExtHdrSerializer struct { + id IPv6ExtensionHeaderIdentifier + headerContents []byte +} + +// identifier implements IPv6SerializableExtHdr. +func (s *dummyIPv6ExtHdrSerializer) identifier() IPv6ExtensionHeaderIdentifier { + return s.id +} + +// length implements IPv6SerializableExtHdr. +func (s *dummyIPv6ExtHdrSerializer) length() int { + return len(s.headerContents) + 1 +} + +// serializeInto implements IPv6SerializableExtHdr. +func (s *dummyIPv6ExtHdrSerializer) serializeInto(nextHeader uint8, b []byte) int { + b[0] = nextHeader + return copy(b[1:], s.headerContents) + 1 +} + +func TestIPv6ExtHdrSerializer(t *testing.T) { + tests := []struct { + name string + headers []IPv6SerializableExtHdr + nextHeader tcpip.TransportProtocolNumber + expectSerialized []byte + expectNextHeader uint8 + }{ + { + name: "one header", + headers: []IPv6SerializableExtHdr{ + &dummyIPv6ExtHdrSerializer{ + id: 15, + headerContents: []byte{1, 2, 3, 4}, + }, + }, + nextHeader: TCPProtocolNumber, + expectSerialized: []byte{byte(TCPProtocolNumber), 1, 2, 3, 4}, + expectNextHeader: 15, + }, + { + name: "two headers", + headers: []IPv6SerializableExtHdr{ + &dummyIPv6ExtHdrSerializer{ + id: 22, + headerContents: []byte{1, 2, 3}, + }, + &dummyIPv6ExtHdrSerializer{ + id: 23, + headerContents: []byte{4, 5, 6}, + }, + }, + nextHeader: ICMPv6ProtocolNumber, + expectSerialized: []byte{ + 23, 1, 2, 3, + byte(ICMPv6ProtocolNumber), 4, 5, 6, + }, + expectNextHeader: 22, + }, + { + name: "no headers", + headers: []IPv6SerializableExtHdr{}, + nextHeader: UDPProtocolNumber, + expectSerialized: []byte{}, + expectNextHeader: byte(UDPProtocolNumber), + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := IPv6ExtHdrSerializer(test.headers) + l := s.Length() + if got, want := l, len(test.expectSerialized); got != want { + t.Fatalf("got serialized length = %d, want = %d", got, want) + } + b := make([]byte, l) + for i := range b { + // Fill the buffer with garbage to make sure we're writing to all bytes. + b[i] = 0xFF + } + nextHeader, serializedLen := s.Serialize(test.nextHeader, b) + if serializedLen != len(test.expectSerialized) || nextHeader != test.expectNextHeader { + t.Errorf( + "got s.Serialize(..) = (%d, %d), want = (%d, %d)", + nextHeader, + serializedLen, + test.expectNextHeader, + len(test.expectSerialized), + ) + } + if diff := cmp.Diff(test.expectSerialized, b); diff != "" { + t.Errorf("serialization mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/tcpip/header/ipv6_fragment.go b/pkg/tcpip/header/ipv6_fragment.go index 018555a26..9d09f32eb 100644 --- a/pkg/tcpip/header/ipv6_fragment.go +++ b/pkg/tcpip/header/ipv6_fragment.go @@ -27,12 +27,11 @@ const ( idV6 = 4 ) -// IPv6FragmentFields contains the fields of an IPv6 fragment. It is used to describe the -// fields of a packet that needs to be encoded. -type IPv6FragmentFields struct { - // NextHeader is the "next header" field of an IPv6 fragment. - NextHeader uint8 +var _ IPv6SerializableExtHdr = (*IPv6SerializableFragmentExtHdr)(nil) +// IPv6SerializableFragmentExtHdr is used to serialize an IPv6 fragment +// extension header as defined in RFC 8200 section 4.5. +type IPv6SerializableFragmentExtHdr struct { // FragmentOffset is the "fragment offset" field of an IPv6 fragment. FragmentOffset uint16 @@ -43,6 +42,29 @@ type IPv6FragmentFields struct { Identification uint32 } +// identifier implements IPv6SerializableFragmentExtHdr. +func (h *IPv6SerializableFragmentExtHdr) identifier() IPv6ExtensionHeaderIdentifier { + return IPv6FragmentHeader +} + +// length implements IPv6SerializableFragmentExtHdr. +func (h *IPv6SerializableFragmentExtHdr) length() int { + return IPv6FragmentHeaderSize +} + +// serializeInto implements IPv6SerializableFragmentExtHdr. +func (h *IPv6SerializableFragmentExtHdr) serializeInto(nextHeader uint8, b []byte) int { + // Prevent too many bounds checks. + _ = b[IPv6FragmentHeaderSize:] + binary.BigEndian.PutUint32(b[idV6:], h.Identification) + binary.BigEndian.PutUint16(b[fragOff:], h.FragmentOffset<<ipv6FragmentExtHdrFragmentOffsetShift) + b[nextHdrFrag] = nextHeader + if h.M { + b[more] |= ipv6FragmentExtHdrMFlagMask + } + return IPv6FragmentHeaderSize +} + // IPv6Fragment represents an ipv6 fragment header stored in a byte array. // Most of the methods of IPv6Fragment access to the underlying slice without // checking the boundaries and could panic because of 'index out of range'. @@ -58,16 +80,6 @@ const ( IPv6FragmentHeaderSize = 8 ) -// Encode encodes all the fields of the ipv6 fragment. -func (b IPv6Fragment) Encode(i *IPv6FragmentFields) { - b[nextHdrFrag] = i.NextHeader - binary.BigEndian.PutUint16(b[fragOff:], i.FragmentOffset<<3) - if i.M { - b[more] |= 1 - } - binary.BigEndian.PutUint32(b[idV6:], i.Identification) -} - // IsValid performs basic validation on the fragment header. func (b IPv6Fragment) IsValid() bool { return len(b) >= IPv6FragmentHeaderSize diff --git a/pkg/tcpip/header/mld.go b/pkg/tcpip/header/mld.go new file mode 100644 index 000000000..ffe03c76a --- /dev/null +++ b/pkg/tcpip/header/mld.go @@ -0,0 +1,103 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header + +import ( + "encoding/binary" + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +const ( + // MLDMinimumSize is the minimum size for an MLD message. + MLDMinimumSize = 20 + + // MLDHopLimit is the Hop Limit for all IPv6 packets with an MLD message, as + // per RFC 2710 section 3. + MLDHopLimit = 1 + + // mldMaximumResponseDelayOffset is the offset to the Maximum Response Delay + // field within MLD. + mldMaximumResponseDelayOffset = 0 + + // mldMulticastAddressOffset is the offset to the Multicast Address field + // within MLD. + mldMulticastAddressOffset = 4 +) + +// MLD is a Multicast Listener Discovery message in an ICMPv6 packet. +// +// MLD will only contain the body of an ICMPv6 packet. +// +// As per RFC 2710 section 3, MLD messages have the following format (MLD only +// holds the bytes after the first four bytes in the diagram below): +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Type | Code | Checksum | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Maximum Response Delay | Reserved | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | | +// + + +// | | +// + Multicast Address + +// | | +// + + +// | | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +type MLD []byte + +// MaximumResponseDelay returns the Maximum Response Delay. +func (m MLD) MaximumResponseDelay() time.Duration { + // As per RFC 2710 section 3.4: + // + // The Maximum Response Delay field is meaningful only in Query + // messages, and specifies the maximum allowed delay before sending a + // responding Report, in units of milliseconds. In all other messages, + // it is set to zero by the sender and ignored by receivers. + return time.Duration(binary.BigEndian.Uint16(m[mldMaximumResponseDelayOffset:])) * time.Millisecond +} + +// SetMaximumResponseDelay sets the Maximum Response Delay field. +// +// maxRespDelayMS is the value in milliseconds. +func (m MLD) SetMaximumResponseDelay(maxRespDelayMS uint16) { + binary.BigEndian.PutUint16(m[mldMaximumResponseDelayOffset:], maxRespDelayMS) +} + +// MulticastAddress returns the Multicast Address. +func (m MLD) MulticastAddress() tcpip.Address { + // As per RFC 2710 section 3.5: + // + // In a Query message, the Multicast Address field is set to zero when + // sending a General Query, and set to a specific IPv6 multicast address + // when sending a Multicast-Address-Specific Query. + // + // In a Report or Done message, the Multicast Address field holds a + // specific IPv6 multicast address to which the message sender is + // listening or is ceasing to listen, respectively. + return tcpip.Address(m[mldMulticastAddressOffset:][:IPv6AddressSize]) +} + +// SetMulticastAddress sets the Multicast Address field. +func (m MLD) SetMulticastAddress(multicastAddress tcpip.Address) { + if n := copy(m[mldMulticastAddressOffset:], multicastAddress); n != IPv6AddressSize { + panic(fmt.Sprintf("copied %d bytes, expected to copy %d bytes", n, IPv6AddressSize)) + } +} diff --git a/pkg/tcpip/header/mld_test.go b/pkg/tcpip/header/mld_test.go new file mode 100644 index 000000000..0cecf10d4 --- /dev/null +++ b/pkg/tcpip/header/mld_test.go @@ -0,0 +1,61 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header + +import ( + "encoding/binary" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +func TestMLD(t *testing.T) { + b := []byte{ + // Maximum Response Delay + 0, 0, + + // Reserved + 0, 0, + + // MulticastAddress + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, + } + + const maxRespDelay = 513 + binary.BigEndian.PutUint16(b, maxRespDelay) + + mld := MLD(b) + + if got, want := mld.MaximumResponseDelay(), maxRespDelay*time.Millisecond; got != want { + t.Errorf("got mld.MaximumResponseDelay() = %s, want = %s", got, want) + } + + const newMaxRespDelay = 1234 + mld.SetMaximumResponseDelay(newMaxRespDelay) + if got, want := mld.MaximumResponseDelay(), newMaxRespDelay*time.Millisecond; got != want { + t.Errorf("got mld.MaximumResponseDelay() = %s, want = %s", got, want) + } + + if got, want := mld.MulticastAddress(), tcpip.Address([]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6}); got != want { + t.Errorf("got mld.MulticastAddress() = %s, want = %s", got, want) + } + + multicastAddress := tcpip.Address([]byte{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}) + mld.SetMulticastAddress(multicastAddress) + if got := mld.MulticastAddress(); got != multicastAddress { + t.Errorf("got mld.MulticastAddress() = %s, want = %s", got, multicastAddress) + } +} diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go index 5d3975c56..554242f0c 100644 --- a/pkg/tcpip/header/ndp_options.go +++ b/pkg/tcpip/header/ndp_options.go @@ -298,7 +298,7 @@ func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) { return it, nil } -// Serialize serializes the provided list of NDP options into o. +// Serialize serializes the provided list of NDP options into b. // // Note, b must be of sufficient size to hold all the options in s. See // NDPOptionsSerializer.Length for details on the getting the total size diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go index 98bdd29db..a6d4fcd59 100644 --- a/pkg/tcpip/header/udp.go +++ b/pkg/tcpip/header/udp.go @@ -36,10 +36,10 @@ const ( // UDPFields contains the fields of a UDP packet. It is used to describe the // fields of a packet that needs to be encoded. type UDPFields struct { - // SrcPort is the "source port" field of a UDP packet. + // SrcPort is the "Source Port" field of a UDP packet. SrcPort uint16 - // DstPort is the "destination port" field of a UDP packet. + // DstPort is the "Destination Port" field of a UDP packet. DstPort uint16 // Length is the "length" field of a UDP packet. @@ -64,52 +64,57 @@ const ( UDPProtocolNumber tcpip.TransportProtocolNumber = 17 ) -// SourcePort returns the "source port" field of the udp header. +// SourcePort returns the "Source Port" field of the UDP header. func (b UDP) SourcePort() uint16 { return binary.BigEndian.Uint16(b[udpSrcPort:]) } -// DestinationPort returns the "destination port" field of the udp header. +// DestinationPort returns the "Destination Port" field of the UDP header. func (b UDP) DestinationPort() uint16 { return binary.BigEndian.Uint16(b[udpDstPort:]) } -// Length returns the "length" field of the udp header. +// Length returns the "Length" field of the UDP header. func (b UDP) Length() uint16 { return binary.BigEndian.Uint16(b[udpLength:]) } // Payload returns the data contained in the UDP datagram. func (b UDP) Payload() []byte { - return b[UDPMinimumSize:] + return b[:b.Length()][UDPMinimumSize:] } -// Checksum returns the "checksum" field of the udp header. +// Checksum returns the "checksum" field of the UDP header. func (b UDP) Checksum() uint16 { return binary.BigEndian.Uint16(b[udpChecksum:]) } -// SetSourcePort sets the "source port" field of the udp header. +// SetSourcePort sets the "source port" field of the UDP header. func (b UDP) SetSourcePort(port uint16) { binary.BigEndian.PutUint16(b[udpSrcPort:], port) } -// SetDestinationPort sets the "destination port" field of the udp header. +// SetDestinationPort sets the "destination port" field of the UDP header. func (b UDP) SetDestinationPort(port uint16) { binary.BigEndian.PutUint16(b[udpDstPort:], port) } -// SetChecksum sets the "checksum" field of the udp header. +// SetChecksum sets the "checksum" field of the UDP header. func (b UDP) SetChecksum(checksum uint16) { binary.BigEndian.PutUint16(b[udpChecksum:], checksum) } -// SetLength sets the "length" field of the udp header. +// SetLength sets the "length" field of the UDP header. func (b UDP) SetLength(length uint16) { binary.BigEndian.PutUint16(b[udpLength:], length) } -// CalculateChecksum calculates the checksum of the udp packet, given the +// PayloadLength returns the length of the payload following the UDP header. +func (b UDP) PayloadLength() uint16 { + return b.Length() - UDPMinimumSize +} + +// CalculateChecksum calculates the checksum of the UDP packet, given the // checksum of the network-layer pseudo-header and the checksum of the payload. func (b UDP) CalculateChecksum(partialChecksum uint16) uint16 { // Calculate the rest of the checksum. diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index 39ca774ef..973f06cbc 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -9,7 +9,6 @@ go_library( deps = [ "//pkg/sync", "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index c95aef63c..0efbfb22b 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -32,7 +31,7 @@ type PacketInfo struct { Pkt *stack.PacketBuffer Proto tcpip.NetworkProtocolNumber GSO *stack.GSO - Route stack.Route + Route *stack.Route } // Notification is the interface for receiving notification from the packet @@ -271,21 +270,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe return n, nil } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - p := PacketInfo{ - Pkt: stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: vv, - }), - Proto: 0, - GSO: nil, - } - - e.q.Write(p) - - return nil -} - // Wait implements stack.LinkEndpoint.Wait. func (*Endpoint) Wait() {} diff --git a/pkg/tcpip/link/ethernet/ethernet.go b/pkg/tcpip/link/ethernet/ethernet.go index 3eef7cd56..beefcd008 100644 --- a/pkg/tcpip/link/ethernet/ethernet.go +++ b/pkg/tcpip/link/ethernet/ethernet.go @@ -62,7 +62,7 @@ func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { // WritePacket implements stack.LinkEndpoint. func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - e.AddHeader(e.Endpoint.LinkAddress(), r.RemoteLinkAddress, proto, pkt) + e.AddHeader(e.Endpoint.LinkAddress(), r.RemoteLinkAddress(), proto, pkt) return e.Endpoint.WritePacket(r, gso, proto, pkt) } @@ -71,7 +71,7 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe linkAddr := e.Endpoint.LinkAddress() for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.AddHeader(linkAddr, r.RemoteLinkAddress, proto, pkt) + e.AddHeader(linkAddr, r.RemoteLinkAddress(), proto, pkt) } return e.Endpoint.WritePackets(r, gso, pkts, proto) diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 975309fc8..cb94cbea6 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -284,9 +284,12 @@ func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher } switch sa.(type) { case *unix.SockaddrLinklayer: - // enable PACKET_FANOUT mode is the underlying socket is - // of type AF_PACKET. - const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG + // Enable PACKET_FANOUT mode if the underlying socket is of type + // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will + // prevent gvisor from receiving fragmented packets and the host does the + // reassembly on our behalf before delivering the fragments. This makes it + // hard to test fragmentation reassembly code in Netstack. + const fanoutType = unix.PACKET_FANOUT_HASH fanoutArg := fanoutID | fanoutType<<16 if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) @@ -410,7 +413,7 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { if e.hdrSize > 0 { - e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress(), protocol, pkt) } var builder iovec.Builder @@ -453,7 +456,7 @@ func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tc mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch)) for _, pkt := range batch { if e.hdrSize > 0 { - e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt) + e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress(), pkt.NetworkProtocolNumber, pkt) } var vnetHdrBuf []byte @@ -558,11 +561,6 @@ func viewsEqual(vs1, vs2 []buffer.View) bool { return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0]) } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - return rawfile.NonBlockingWrite(e.fds[0], vv.ToView()) -} - // InjectOutobund implements stack.InjectableEndpoint.InjectOutbound. func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error { return rawfile.NonBlockingWrite(e.fds[0], packet) diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 709f829c8..ce4da7230 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -183,9 +183,8 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash u c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize}) defer c.cleanup() - r := &stack.Route{ - RemoteLinkAddress: raddr, - } + var r stack.Route + r.ResolveWith(raddr) // Build payload. payload := buffer.NewView(plen) @@ -220,7 +219,7 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash u L3HdrLen: header.IPv4MaximumHeaderSize, } } - if err := c.ep.WritePacket(r, gso, proto, pkt); err != nil { + if err := c.ep.WritePacket(&r, gso, proto, pkt); err != nil { t.Fatalf("WritePacket failed: %v", err) } @@ -325,9 +324,9 @@ func TestPreserveSrcAddress(t *testing.T) { // Set LocalLinkAddress in route to the value of the bridged address. r := &stack.Route{ - RemoteLinkAddress: raddr, - LocalLinkAddress: baddr, + LocalLinkAddress: baddr, } + r.ResolveWith(raddr) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ // WritePacket panics given a prependable with anything less than diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 38aa694e4..edca57e4e 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -96,23 +96,6 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList panic("not implemented") } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: vv, - }) - // There should be an ethernet header at the beginning of vv. - hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) - if !ok { - // Reject the packet if it's shorter than an ethernet header. - return tcpip.ErrBadAddress - } - linkHeader := header.Ethernet(hdr) - e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), pkt) - - return nil -} - // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareLoopback diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD index e7493e5c5..cbda59775 100644 --- a/pkg/tcpip/link/muxed/BUILD +++ b/pkg/tcpip/link/muxed/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index 56a611825..22e79ce3a 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -17,7 +17,6 @@ package muxed import ( "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -106,13 +105,6 @@ func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protoco return tcpip.ErrNoRoute } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (m *InjectableEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { - // WriteRawPacket doesn't get a route or network address, so there's - // nowhere to write this. - return tcpip.ErrNoRoute -} - // InjectOutbound writes outbound packets to the appropriate // LinkInjectableEndpoint based on the dest address. func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error { diff --git a/pkg/tcpip/link/nested/BUILD b/pkg/tcpip/link/nested/BUILD index 2cdb23475..00b42b924 100644 --- a/pkg/tcpip/link/nested/BUILD +++ b/pkg/tcpip/link/nested/BUILD @@ -11,7 +11,6 @@ go_library( deps = [ "//pkg/sync", "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/link/nested/nested.go b/pkg/tcpip/link/nested/nested.go index d40de54df..0ee54c3d5 100644 --- a/pkg/tcpip/link/nested/nested.go +++ b/pkg/tcpip/link/nested/nested.go @@ -19,7 +19,6 @@ package nested import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -123,11 +122,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe return e.child.WritePackets(r, gso, pkts, protocol) } -// WriteRawPacket implements stack.LinkEndpoint. -func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - return e.child.WriteRawPacket(vv) -} - // Wait implements stack.LinkEndpoint. func (e *Endpoint) Wait() { e.child.Wait() diff --git a/pkg/tcpip/link/packetsocket/endpoint.go b/pkg/tcpip/link/packetsocket/endpoint.go index 3922c2a04..9a1b0c0c2 100644 --- a/pkg/tcpip/link/packetsocket/endpoint.go +++ b/pkg/tcpip/link/packetsocket/endpoint.go @@ -36,14 +36,14 @@ func New(lower stack.LinkEndpoint) stack.LinkEndpoint { // WritePacket implements stack.LinkEndpoint.WritePacket. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress, r.LocalLinkAddress, protocol, pkt) + e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress(), r.LocalLinkAddress, protocol, pkt) return e.Endpoint.WritePacket(r, gso, protocol, pkt) } // WritePackets implements stack.LinkEndpoint.WritePackets. func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.Endpoint.DeliverOutboundPacket(pkt.EgressRoute.RemoteLinkAddress, pkt.EgressRoute.LocalLinkAddress, pkt.NetworkProtocolNumber, pkt) + e.Endpoint.DeliverOutboundPacket(pkt.EgressRoute.RemoteLinkAddress(), pkt.EgressRoute.LocalLinkAddress, pkt.NetworkProtocolNumber, pkt) } return e.Endpoint.WritePackets(r, gso, pkts, proto) diff --git a/pkg/tcpip/link/pipe/pipe.go b/pkg/tcpip/link/pipe/pipe.go index 523b0d24b..25c364391 100644 --- a/pkg/tcpip/link/pipe/pipe.go +++ b/pkg/tcpip/link/pipe/pipe.go @@ -55,7 +55,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, _ *stack.GSO, proto tcpip.Network // remote address from the perspective of the other end of the pipe // (e.linked). Similarly, the remote address from the perspective of this // endpoint is the local address on the other end. - e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{ + e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress() /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), })) @@ -67,11 +67,6 @@ func (*Endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, panic("not implemented") } -// WriteRawPacket implements stack.LinkEndpoint. -func (*Endpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { - panic("not implemented") -} - // Attach implements stack.LinkEndpoint. func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.dispatcher = dispatcher diff --git a/pkg/tcpip/link/qdisc/fifo/BUILD b/pkg/tcpip/link/qdisc/fifo/BUILD index 1d0079bd6..5bea598eb 100644 --- a/pkg/tcpip/link/qdisc/fifo/BUILD +++ b/pkg/tcpip/link/qdisc/fifo/BUILD @@ -13,7 +13,6 @@ go_library( "//pkg/sleep", "//pkg/sync", "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go index fc1e34fc7..27667f5f0 100644 --- a/pkg/tcpip/link/qdisc/fifo/endpoint.go +++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -156,7 +155,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne // WritePacket caller's do not set the following fields in PacketBuffer // so we populate them here. newRoute := r.Clone() - pkt.EgressRoute = &newRoute + pkt.EgressRoute = newRoute pkt.GSOOptions = gso pkt.NetworkProtocolNumber = protocol d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)] @@ -183,7 +182,7 @@ func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketB // the route here to ensure it doesn't get released while the // packet is still in our queue. newRoute := pkt.EgressRoute.Clone() - pkt.EgressRoute = &newRoute + pkt.EgressRoute = newRoute if !d.q.enqueue(pkt) { if enqueued > 0 { d.newPacketWaker.Assert() @@ -197,13 +196,6 @@ func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketB return enqueued, nil } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - // TODO(gvisor.dev/issue/3267): Queue these packets as well once - // WriteRawPacket takes PacketBuffer instead of VectorisedView. - return e.lower.WriteRawPacket(vv) -} - // Wait implements stack.LinkEndpoint.Wait. func (e *endpoint) Wait() { e.lower.Wait() diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 7fb8a6c49..5660418fa 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -204,7 +204,7 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress(), protocol, pkt) views := pkt.Views() // Transmit the packet. @@ -224,21 +224,6 @@ func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketB panic("not implemented") } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - views := vv.Views() - // Transmit the packet. - e.mu.Lock() - ok := e.tx.transmit(views...) - e.mu.Unlock() - - if !ok { - return tcpip.ErrWouldBlock - } - - return nil -} - // dispatchLoop reads packets from the rx queue in a loop and dispatches them // to the network stack. func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) { diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 22d5c97f1..7131392cc 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -260,9 +260,8 @@ func TestSimpleSend(t *testing.T) { defer c.cleanup() // Prepare route. - r := stack.Route{ - RemoteLinkAddress: remoteLinkAddr, - } + var r stack.Route + r.ResolveWith(remoteLinkAddr) for iters := 1000; iters > 0; iters-- { func() { @@ -342,9 +341,9 @@ func TestPreserveSrcAddressInSend(t *testing.T) { newLocalLinkAddress := tcpip.LinkAddress(strings.Repeat("0xFE", 6)) // Set both remote and local link address in route. r := stack.Route{ - RemoteLinkAddress: remoteLinkAddr, - LocalLinkAddress: newLocalLinkAddress, + LocalLinkAddress: newLocalLinkAddress, } + r.ResolveWith(remoteLinkAddr) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ // WritePacket panics given a prependable with anything less than @@ -395,9 +394,8 @@ func TestFillTxQueue(t *testing.T) { defer c.cleanup() // Prepare to send a packet. - r := stack.Route{ - RemoteLinkAddress: remoteLinkAddr, - } + var r stack.Route + r.ResolveWith(remoteLinkAddr) buf := buffer.NewView(100) @@ -444,9 +442,8 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) { c.txq.rx.Flush() // Prepare to send a packet. - r := stack.Route{ - RemoteLinkAddress: remoteLinkAddr, - } + var r stack.Route + r.ResolveWith(remoteLinkAddr) buf := buffer.NewView(100) @@ -509,9 +506,8 @@ func TestFillTxMemory(t *testing.T) { defer c.cleanup() // Prepare to send a packet. - r := stack.Route{ - RemoteLinkAddress: remoteLinkAddr, - } + var r stack.Route + r.ResolveWith(remoteLinkAddr) buf := buffer.NewView(100) @@ -557,9 +553,8 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) { defer c.cleanup() // Prepare to send a packet. - r := stack.Route{ - RemoteLinkAddress: remoteLinkAddr, - } + var r stack.Route + r.ResolveWith(remoteLinkAddr) buf := buffer.NewView(100) diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index b3e8c4b92..8d9a91020 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -53,16 +53,35 @@ type endpoint struct { nested.Endpoint writer io.Writer maxPCAPLen uint32 + logPrefix string } var _ stack.GSOEndpoint = (*endpoint)(nil) var _ stack.LinkEndpoint = (*endpoint)(nil) var _ stack.NetworkDispatcher = (*endpoint)(nil) +type direction int + +const ( + directionSend = iota + directionRecv +) + // New creates a new sniffer link-layer endpoint. It wraps around another // endpoint and logs packets and they traverse the endpoint. func New(lower stack.LinkEndpoint) stack.LinkEndpoint { - sniffer := &endpoint{} + return NewWithPrefix(lower, "") +} + +// NewWithPrefix creates a new sniffer link-layer endpoint. It wraps around +// another endpoint and logs packets prefixed with logPrefix as they traverse +// the endpoint. +// +// logPrefix is prepended to the log line without any separators. +// E.g. logPrefix = "NIC:en0/" will produce log lines like +// "NIC:en0/send udp [...]". +func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoint { + sniffer := &endpoint{logPrefix: logPrefix} sniffer.Endpoint.Init(lower, sniffer) return sniffer } @@ -120,7 +139,7 @@ func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) ( // called by the link-layer endpoint being wrapped when a packet arrives, and // logs the packet before forwarding to the actual dispatcher. func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { - e.dumpPacket("recv", nil, protocol, pkt) + e.dumpPacket(directionRecv, nil, protocol, pkt) e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt) } @@ -129,10 +148,10 @@ func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protoc e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt) } -func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +func (e *endpoint) dumpPacket(dir direction, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { writer := e.writer if writer == nil && atomic.LoadUint32(&LogPackets) == 1 { - logPacket(prefix, protocol, pkt, gso) + logPacket(e.logPrefix, dir, protocol, pkt, gso) } if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 { totalLength := pkt.Size() @@ -169,7 +188,7 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - e.dumpPacket("send", gso, protocol, pkt) + e.dumpPacket(directionSend, gso, protocol, pkt) return e.Endpoint.WritePacket(r, gso, protocol, pkt) } @@ -178,20 +197,12 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne // forwards the request to the lower endpoint. func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.dumpPacket("send", gso, protocol, pkt) + e.dumpPacket(directionSend, gso, protocol, pkt) } return e.Endpoint.WritePackets(r, gso, pkts, protocol) } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - e.dumpPacket("send", nil, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: vv, - })) - return e.Endpoint.WriteRawPacket(vv) -} - -func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) { +func logPacket(prefix string, dir direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) { // Figure out the network layer info. var transProto uint8 src := tcpip.Address("unknown") @@ -201,6 +212,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P var fragmentOffset uint16 var moreFragments bool + var directionPrefix string + switch dir { + case directionSend: + directionPrefix = "send" + case directionRecv: + directionPrefix = "recv" + default: + panic(fmt.Sprintf("unrecognized direction: %d", dir)) + } + // Clone the packet buffer to not modify the original. // // We don't clone the original packet buffer so that the new packet buffer @@ -248,15 +269,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P arp := header.ARP(pkt.NetworkHeader().View()) log.Infof( - "%s arp %s (%s) -> %s (%s) valid:%t", + "%s%s arp %s (%s) -> %s (%s) valid:%t", prefix, + directionPrefix, tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()), tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()), arp.IsValid(), ) return default: - log.Infof("%s unknown network protocol", prefix) + log.Infof("%s%s unknown network protocol", prefix, directionPrefix) return } @@ -300,7 +322,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P icmpType = "info reply" } } - log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) + log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.ICMPv6ProtocolNumber: @@ -335,7 +357,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.ICMPv6RedirectMsg: icmpType = "redirect message" } - log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) + log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.UDPProtocolNumber: @@ -391,7 +413,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P } default: - log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto) + log.Infof("%s%s %s -> %s unknown transport protocol: %d", prefix, directionPrefix, src, dst, transProto) return } @@ -399,5 +421,5 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P details += fmt.Sprintf(" gso: %+v", gso) } - log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details) + log.Infof("%s%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, directionPrefix, transName, src, srcPort, dst, dstPort, size, id, details) } diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 9a76bdba7..a364c5801 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -264,7 +264,7 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) { // If the packet does not already have link layer header, and the route // does not exist, we can't compute it. This is possibly a raw packet, tun // device doesn't support this at the moment. - if info.Pkt.LinkHeader().View().IsEmpty() && info.Route.RemoteLinkAddress == "" { + if info.Pkt.LinkHeader().View().IsEmpty() && info.Route.RemoteLinkAddress() == "" { return nil, false } @@ -272,7 +272,7 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) { if d.hasFlags(linux.IFF_TAP) { // Add ethernet header if not provided. if info.Pkt.LinkHeader().View().IsEmpty() { - d.endpoint.AddHeader(info.Route.LocalLinkAddress, info.Route.RemoteLinkAddress, info.Proto, info.Pkt) + d.endpoint.AddHeader(info.Route.LocalLinkAddress, info.Route.RemoteLinkAddress(), info.Proto, info.Pkt) } vv.AppendView(info.Pkt.LinkHeader().View()) } diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index ee84c3d96..9b4602c1b 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -11,7 +11,6 @@ go_library( deps = [ "//pkg/gate", "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", ], @@ -25,7 +24,6 @@ go_test( library = ":waitable", deps = [ "//pkg/tcpip", - "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index b152a0f26..cf0077f43 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -24,7 +24,6 @@ package waitable import ( "gvisor.dev/gvisor/pkg/gate" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -132,17 +131,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe return n, err } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - if !e.writeGate.Enter() { - return nil - } - - err := e.lower.WriteRawPacket(vv) - e.writeGate.Leave() - return err -} - // WaitWrite prevents new calls to WritePacket from reaching the lower endpoint, // and waits for inflight ones to finish before returning. func (e *Endpoint) WaitWrite() { diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index 94827fc56..cf7fb5126 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -18,7 +18,6 @@ import ( "testing" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -81,11 +80,6 @@ func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack. return pkts.Len(), nil } -func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { - e.writeCount++ - return nil -} - // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*countedEndpoint) ARPHardwareType() header.ARPHardwareType { panic("unimplemented") diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index b38aff0b8..9ebf31b78 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -7,12 +7,14 @@ go_test( size = "small", srcs = [ "ip_test.go", + "multicast_group_test.go", ], deps = [ "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/checker", + "//pkg/tcpip/faketime", "//pkg/tcpip/header", "//pkg/tcpip/header/parse", "//pkg/tcpip/link/channel", diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index f462524c9..0fb373612 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -319,9 +319,9 @@ func TestDirectRequestWithNeighborCache(t *testing.T) { copy(h.HardwareAddressSender(), test.senderLinkAddr) copy(h.ProtocolAddressSender(), test.senderAddr) copy(h.ProtocolAddressTarget(), test.targetAddr) - c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{ + c.linkEP.InjectInbound(arp.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: v.ToVectorisedView(), - }) + })) if !test.isValid { // No packets should be sent after receiving an invalid ARP request. @@ -442,9 +442,9 @@ func (*testInterface) Promiscuous() bool { func (t *testInterface) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { r := stack.Route{ - NetProto: protocol, - RemoteLinkAddress: remoteLinkAddr, + NetProto: protocol, } + r.ResolveWith(remoteLinkAddr) return t.LinkEndpoint.WritePacket(&r, gso, protocol, pkt) } @@ -557,8 +557,8 @@ func TestLinkAddressRequest(t *testing.T) { t.Fatal("expected to send a link address request") } - if pkt.Route.RemoteLinkAddress != test.expectedRemoteLinkAddr { - t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedRemoteLinkAddr) + if got := pkt.Route.RemoteLinkAddress(); got != test.expectedRemoteLinkAddr { + t.Errorf("got pkt.Route.RemoteLinkAddress() = %s, want = %s", got, test.expectedRemoteLinkAddr) } rep := header.ARP(stack.PayloadSince(pkt.Pkt.NetworkHeader())) diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index d8e4a3b54..429af69ee 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -18,7 +18,6 @@ go_template_instance( go_library( name = "fragmentation", srcs = [ - "frag_heap.go", "fragmentation.go", "reassembler.go", "reassembler_list.go", @@ -38,7 +37,6 @@ go_test( name = "fragmentation_test", size = "small", srcs = [ - "frag_heap_test.go", "fragmentation_test.go", "reassembler_test.go", ], diff --git a/pkg/tcpip/network/fragmentation/frag_heap.go b/pkg/tcpip/network/fragmentation/frag_heap.go deleted file mode 100644 index 0b570d25a..000000000 --- a/pkg/tcpip/network/fragmentation/frag_heap.go +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fragmentation - -import ( - "container/heap" - "fmt" - - "gvisor.dev/gvisor/pkg/tcpip/buffer" -) - -type fragment struct { - offset uint16 - vv buffer.VectorisedView -} - -type fragHeap []fragment - -func (h *fragHeap) Len() int { - return len(*h) -} - -func (h *fragHeap) Less(i, j int) bool { - return (*h)[i].offset < (*h)[j].offset -} - -func (h *fragHeap) Swap(i, j int) { - (*h)[i], (*h)[j] = (*h)[j], (*h)[i] -} - -func (h *fragHeap) Push(x interface{}) { - *h = append(*h, x.(fragment)) -} - -func (h *fragHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[:n-1] - return x -} - -// reassamble empties the heap and returns a VectorisedView -// containing a reassambled version of the fragments inside the heap. -func (h *fragHeap) reassemble() (buffer.VectorisedView, error) { - curr := heap.Pop(h).(fragment) - views := curr.vv.Views() - size := curr.vv.Size() - - if curr.offset != 0 { - return buffer.VectorisedView{}, fmt.Errorf("offset of the first packet is != 0 (%d)", curr.offset) - } - - for h.Len() > 0 { - curr := heap.Pop(h).(fragment) - if int(curr.offset) < size { - curr.vv.TrimFront(size - int(curr.offset)) - } else if int(curr.offset) > size { - return buffer.VectorisedView{}, fmt.Errorf("packet has a hole, expected offset %d, got %d", size, curr.offset) - } - size += curr.vv.Size() - views = append(views, curr.vv.Views()...) - } - return buffer.NewVectorisedView(size, views), nil -} diff --git a/pkg/tcpip/network/fragmentation/frag_heap_test.go b/pkg/tcpip/network/fragmentation/frag_heap_test.go deleted file mode 100644 index 9ececcb9f..000000000 --- a/pkg/tcpip/network/fragmentation/frag_heap_test.go +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fragmentation - -import ( - "container/heap" - "reflect" - "testing" - - "gvisor.dev/gvisor/pkg/tcpip/buffer" -) - -var reassambleTestCases = []struct { - comment string - in []fragment - want buffer.VectorisedView -}{ - { - comment: "Non-overlapping in-order", - in: []fragment{ - {offset: 0, vv: vv(1, "0")}, - {offset: 1, vv: vv(1, "1")}, - }, - want: vv(2, "0", "1"), - }, - { - comment: "Non-overlapping out-of-order", - in: []fragment{ - {offset: 1, vv: vv(1, "1")}, - {offset: 0, vv: vv(1, "0")}, - }, - want: vv(2, "0", "1"), - }, - { - comment: "Duplicated packets", - in: []fragment{ - {offset: 0, vv: vv(1, "0")}, - {offset: 0, vv: vv(1, "0")}, - }, - want: vv(1, "0"), - }, - { - comment: "Overlapping in-order", - in: []fragment{ - {offset: 0, vv: vv(2, "01")}, - {offset: 1, vv: vv(2, "12")}, - }, - want: vv(3, "01", "2"), - }, - { - comment: "Overlapping out-of-order", - in: []fragment{ - {offset: 1, vv: vv(2, "12")}, - {offset: 0, vv: vv(2, "01")}, - }, - want: vv(3, "01", "2"), - }, - { - comment: "Overlapping subset in-order", - in: []fragment{ - {offset: 0, vv: vv(3, "012")}, - {offset: 1, vv: vv(1, "1")}, - }, - want: vv(3, "012"), - }, - { - comment: "Overlapping subset out-of-order", - in: []fragment{ - {offset: 1, vv: vv(1, "1")}, - {offset: 0, vv: vv(3, "012")}, - }, - want: vv(3, "012"), - }, -} - -func TestReassamble(t *testing.T) { - for _, c := range reassambleTestCases { - t.Run(c.comment, func(t *testing.T) { - h := make(fragHeap, 0, 8) - heap.Init(&h) - for _, f := range c.in { - heap.Push(&h, f) - } - got, err := h.reassemble() - if err != nil { - t.Fatal(err) - } - if !reflect.DeepEqual(got, c.want) { - t.Errorf("got reassemble(%+v) = %v, want = %v", c.in, got, c.want) - } - }) - } -} - -func TestReassambleFailsForNonZeroOffset(t *testing.T) { - h := make(fragHeap, 0, 8) - heap.Init(&h) - heap.Push(&h, fragment{offset: 1, vv: vv(1, "0")}) - _, err := h.reassemble() - if err == nil { - t.Errorf("reassemble() did not fail when the first packet had offset != 0") - } -} - -func TestReassambleFailsForHoles(t *testing.T) { - h := make(fragHeap, 0, 8) - heap.Init(&h) - heap.Push(&h, fragment{offset: 0, vv: vv(1, "0")}) - heap.Push(&h, fragment{offset: 2, vv: vv(1, "1")}) - _, err := h.reassemble() - if err == nil { - t.Errorf("reassemble() did not fail when there was a hole in the packet") - } -} diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index c75ca7d71..1af87d713 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -46,9 +46,17 @@ const ( ) var ( - // ErrInvalidArgs indicates to the caller that that an invalid argument was + // ErrInvalidArgs indicates to the caller that an invalid argument was // provided. ErrInvalidArgs = errors.New("invalid args") + + // ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps + // with another one. + ErrFragmentOverlap = errors.New("overlapping fragments") + + // ErrFragmentConflict indicates that, during reassembly, some fragments are + // in conflict with one another. + ErrFragmentConflict = errors.New("conflicting fragments") ) // FragmentID is the identifier for a fragment. diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index 19f4920b3..9b20bb1d8 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -15,9 +15,8 @@ package fragmentation import ( - "container/heap" - "fmt" "math" + "sort" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -26,9 +25,11 @@ import ( ) type hole struct { - first uint16 - last uint16 - deleted bool + first uint16 + last uint16 + filled bool + final bool + data buffer.View } type reassembler struct { @@ -38,8 +39,7 @@ type reassembler struct { proto uint8 mu sync.Mutex holes []hole - deleted int - heap fragHeap + filled int done bool creationTime int64 pkt *stack.PacketBuffer @@ -48,49 +48,94 @@ type reassembler struct { func newReassembler(id FragmentID, clock tcpip.Clock) *reassembler { r := &reassembler{ id: id, - holes: make([]hole, 0, 16), - heap: make(fragHeap, 0, 8), creationTime: clock.NowMonotonic(), } r.holes = append(r.holes, hole{ - first: 0, - last: math.MaxUint16, - deleted: false}) + first: 0, + last: math.MaxUint16, + filled: false, + final: true, + }) return r } -// updateHoles updates the list of holes for an incoming fragment and -// returns true iff the fragment filled at least part of an existing hole. -func (r *reassembler) updateHoles(first, last uint16, more bool) bool { - used := false - for i := range r.holes { - if r.holes[i].deleted || first > r.holes[i].last || last < r.holes[i].first { - continue - } - used = true - r.deleted++ - r.holes[i].deleted = true - if first > r.holes[i].first { - r.holes = append(r.holes, hole{r.holes[i].first, first - 1, false}) - } - if last < r.holes[i].last && more { - r.holes = append(r.holes, hole{last + 1, r.holes[i].last, false}) - } - } - return used -} - func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (buffer.VectorisedView, uint8, bool, int, error) { r.mu.Lock() defer r.mu.Unlock() - consumed := 0 if r.done { // A concurrent goroutine might have already reassembled // the packet and emptied the heap while this goroutine // was waiting on the mutex. We don't have to do anything in this case. - return buffer.VectorisedView{}, 0, false, consumed, nil + return buffer.VectorisedView{}, 0, false, 0, nil } - if r.updateHoles(first, last, more) { + + var holeFound bool + var consumed int + for i := range r.holes { + currentHole := &r.holes[i] + + if last < currentHole.first || currentHole.last < first { + continue + } + // For IPv6, overlaps with an existing fragment are explicitly forbidden by + // RFC 8200 section 4.5: + // If any of the fragments being reassembled overlap with any other + // fragments being reassembled for the same packet, reassembly of that + // packet must be abandoned and all the fragments that have been received + // for that packet must be discarded, and no ICMP error messages should be + // sent. + // + // It is not explicitly forbidden for IPv4, but to keep parity with Linux we + // disallow it as well: + // https://github.com/torvalds/linux/blob/38525c6/net/ipv4/inet_fragment.c#L349 + if first < currentHole.first || currentHole.last < last { + // Incoming fragment only partially fits in the free hole. + return buffer.VectorisedView{}, 0, false, 0, ErrFragmentOverlap + } + if !more { + if !currentHole.final || currentHole.filled && currentHole.last != last { + // We have another final fragment, which does not perfectly overlap. + return buffer.VectorisedView{}, 0, false, 0, ErrFragmentConflict + } + } + + holeFound = true + if currentHole.filled { + // Incoming fragment is a duplicate. + continue + } + + // We are populating the current hole with the payload and creating a new + // hole for any unfilled ranges on either end. + if first > currentHole.first { + r.holes = append(r.holes, hole{ + first: currentHole.first, + last: first - 1, + filled: false, + final: false, + }) + } + if last < currentHole.last && more { + r.holes = append(r.holes, hole{ + first: last + 1, + last: currentHole.last, + filled: false, + final: currentHole.final, + }) + currentHole.final = false + } + v := pkt.Data.ToOwnedView() + consumed = v.Size() + r.size += consumed + // Update the current hole to precisely match the incoming fragment. + r.holes[i] = hole{ + first: first, + last: last, + filled: true, + final: currentHole.final, + data: v, + } + r.filled++ // For IPv6, it is possible to have different Protocol values between // fragments of a packet (because, unlike IPv4, the Protocol is not used to // identify a fragment). In this case, only the Protocol of the first @@ -103,21 +148,30 @@ func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *s r.pkt = pkt r.proto = proto } - vv := pkt.Data - // We store the incoming packet only if it filled some holes. - heap.Push(&r.heap, fragment{offset: first, vv: vv.Clone(nil)}) - consumed = vv.Size() - r.size += consumed + + break + } + if !holeFound { + // Incoming fragment is beyond end. + return buffer.VectorisedView{}, 0, false, 0, ErrFragmentConflict } - // Check if all the holes have been deleted and we are ready to reassamble. - if r.deleted < len(r.holes) { + + // Check if all the holes have been filled and we are ready to reassemble. + if r.filled < len(r.holes) { return buffer.VectorisedView{}, 0, false, consumed, nil } - res, err := r.heap.reassemble() - if err != nil { - return buffer.VectorisedView{}, 0, false, consumed, fmt.Errorf("fragment reassembly failed: %w", err) + + sort.Slice(r.holes, func(i, j int) bool { + return r.holes[i].first < r.holes[j].first + }) + + var size int + views := make([]buffer.View, 0, len(r.holes)) + for _, hole := range r.holes { + views = append(views, hole.data) + size += hole.data.Size() } - return res, r.proto, true, consumed, nil + return buffer.NewVectorisedView(size, views), r.proto, true, consumed, nil } func (r *reassembler) checkDoneOrMark() bool { diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index a0a04a027..2ff03eeeb 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -16,92 +16,175 @@ package fragmentation import ( "math" - "reflect" "testing" + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/stack" ) -type updateHolesInput struct { - first uint16 - last uint16 - more bool +type processParams struct { + first uint16 + last uint16 + more bool + pkt *stack.PacketBuffer + wantDone bool + wantError error } -var holesTestCases = []struct { - comment string - in []updateHolesInput - want []hole -}{ - { - comment: "No fragments. Expected holes: {[0 -> inf]}.", - in: []updateHolesInput{}, - want: []hole{{first: 0, last: math.MaxUint16, deleted: false}}, - }, - { - comment: "One fragment at beginning. Expected holes: {[2, inf]}.", - in: []updateHolesInput{{first: 0, last: 1, more: true}}, - want: []hole{ - {first: 0, last: math.MaxUint16, deleted: true}, - {first: 2, last: math.MaxUint16, deleted: false}, +func TestReassemblerProcess(t *testing.T) { + const proto = 99 + + v := func(size int) buffer.View { + payload := buffer.NewView(size) + for i := 1; i < size; i++ { + payload[i] = uint8(i) * 3 + } + return payload + } + + pkt := func(size int) *stack.PacketBuffer { + return stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: v(size).ToVectorisedView(), + }) + } + + var tests = []struct { + name string + params []processParams + want []hole + }{ + { + name: "No fragments", + params: nil, + want: []hole{{first: 0, last: math.MaxUint16, filled: false, final: true}}, }, - }, - { - comment: "One fragment in the middle. Expected holes: {[0, 0], [3, inf]}.", - in: []updateHolesInput{{first: 1, last: 2, more: true}}, - want: []hole{ - {first: 0, last: math.MaxUint16, deleted: true}, - {first: 0, last: 0, deleted: false}, - {first: 3, last: math.MaxUint16, deleted: false}, + { + name: "One fragment at beginning", + params: []processParams{{first: 0, last: 1, more: true, pkt: pkt(2), wantDone: false, wantError: nil}}, + want: []hole{ + {first: 0, last: 1, filled: true, final: false, data: v(2)}, + {first: 2, last: math.MaxUint16, filled: false, final: true}, + }, }, - }, - { - comment: "One fragment at the end. Expected holes: {[0, 0]}.", - in: []updateHolesInput{{first: 1, last: 2, more: false}}, - want: []hole{ - {first: 0, last: math.MaxUint16, deleted: true}, - {first: 0, last: 0, deleted: false}, + { + name: "One fragment in the middle", + params: []processParams{{first: 1, last: 2, more: true, pkt: pkt(2), wantDone: false, wantError: nil}}, + want: []hole{ + {first: 1, last: 2, filled: true, final: false, data: v(2)}, + {first: 0, last: 0, filled: false, final: false}, + {first: 3, last: math.MaxUint16, filled: false, final: true}, + }, }, - }, - { - comment: "One fragment completing a packet. Expected holes: {}.", - in: []updateHolesInput{{first: 0, last: 1, more: false}}, - want: []hole{ - {first: 0, last: math.MaxUint16, deleted: true}, + { + name: "One fragment at the end", + params: []processParams{{first: 1, last: 2, more: false, pkt: pkt(2), wantDone: false, wantError: nil}}, + want: []hole{ + {first: 1, last: 2, filled: true, final: true, data: v(2)}, + {first: 0, last: 0, filled: false}, + }, }, - }, - { - comment: "Two non-overlapping fragments completing a packet. Expected holes: {}.", - in: []updateHolesInput{ - {first: 0, last: 1, more: true}, - {first: 2, last: 3, more: false}, + { + name: "One fragment completing a packet", + params: []processParams{{first: 0, last: 1, more: false, pkt: pkt(2), wantDone: true, wantError: nil}}, + want: []hole{ + {first: 0, last: 1, filled: true, final: true, data: v(2)}, + }, }, - want: []hole{ - {first: 0, last: math.MaxUint16, deleted: true}, - {first: 2, last: math.MaxUint16, deleted: true}, + { + name: "Two fragments completing a packet", + params: []processParams{ + {first: 0, last: 1, more: true, pkt: pkt(2), wantDone: false, wantError: nil}, + {first: 2, last: 3, more: false, pkt: pkt(2), wantDone: true, wantError: nil}, + }, + want: []hole{ + {first: 0, last: 1, filled: true, final: false, data: v(2)}, + {first: 2, last: 3, filled: true, final: true, data: v(2)}, + }, }, - }, - { - comment: "Two overlapping fragments completing a packet. Expected holes: {}.", - in: []updateHolesInput{ - {first: 0, last: 2, more: true}, - {first: 2, last: 3, more: false}, + { + name: "Two fragments completing a packet with a duplicate", + params: []processParams{ + {first: 0, last: 1, more: true, pkt: pkt(2), wantDone: false, wantError: nil}, + {first: 0, last: 1, more: true, pkt: pkt(2), wantDone: false, wantError: nil}, + {first: 2, last: 3, more: false, pkt: pkt(2), wantDone: true, wantError: nil}, + }, + want: []hole{ + {first: 0, last: 1, filled: true, final: false, data: v(2)}, + {first: 2, last: 3, filled: true, final: true, data: v(2)}, + }, }, - want: []hole{ - {first: 0, last: math.MaxUint16, deleted: true}, - {first: 3, last: math.MaxUint16, deleted: true}, + { + name: "Two fragments completing a packet with a partial duplicate", + params: []processParams{ + {first: 0, last: 3, more: true, pkt: pkt(4), wantDone: false, wantError: nil}, + {first: 1, last: 2, more: true, pkt: pkt(2), wantDone: false, wantError: nil}, + {first: 4, last: 5, more: false, pkt: pkt(2), wantDone: true, wantError: nil}, + }, + want: []hole{ + {first: 0, last: 3, filled: true, final: false, data: v(4)}, + {first: 4, last: 5, filled: true, final: true, data: v(2)}, + }, }, - }, -} + { + name: "Two overlapping fragments", + params: []processParams{ + {first: 0, last: 10, more: true, pkt: pkt(11), wantDone: false, wantError: nil}, + {first: 5, last: 15, more: false, pkt: pkt(11), wantDone: false, wantError: ErrFragmentOverlap}, + }, + want: []hole{ + {first: 0, last: 10, filled: true, final: false, data: v(11)}, + {first: 11, last: math.MaxUint16, filled: false, final: true}, + }, + }, + { + name: "Two final fragments with different ends", + params: []processParams{ + {first: 10, last: 14, more: false, pkt: pkt(5), wantDone: false, wantError: nil}, + {first: 0, last: 9, more: false, pkt: pkt(10), wantDone: false, wantError: ErrFragmentConflict}, + }, + want: []hole{ + {first: 10, last: 14, filled: true, final: true, data: v(5)}, + {first: 0, last: 9, filled: false, final: false}, + }, + }, + { + name: "Two final fragments - duplicate", + params: []processParams{ + {first: 5, last: 14, more: false, pkt: pkt(10), wantDone: false, wantError: nil}, + {first: 10, last: 14, more: false, pkt: pkt(5), wantDone: false, wantError: nil}, + }, + want: []hole{ + {first: 5, last: 14, filled: true, final: true, data: v(10)}, + {first: 0, last: 4, filled: false, final: false}, + }, + }, + { + name: "Two final fragments - duplicate, with different ends", + params: []processParams{ + {first: 5, last: 14, more: false, pkt: pkt(10), wantDone: false, wantError: nil}, + {first: 10, last: 13, more: false, pkt: pkt(4), wantDone: false, wantError: ErrFragmentConflict}, + }, + want: []hole{ + {first: 5, last: 14, filled: true, final: true, data: v(10)}, + {first: 0, last: 4, filled: false, final: false}, + }, + }, + } -func TestUpdateHoles(t *testing.T) { - for _, c := range holesTestCases { - r := newReassembler(FragmentID{}, &faketime.NullClock{}) - for _, i := range c.in { - r.updateHoles(i.first, i.last, i.more) - } - if !reflect.DeepEqual(r.holes, c.want) { - t.Errorf("Test \"%s\" produced unexepetced holes. Got %v. Want %v", c.comment, r.holes, c.want) - } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + r := newReassembler(FragmentID{}, &faketime.NullClock{}) + for _, param := range test.params { + _, _, done, _, err := r.process(param.first, param.last, param.more, proto, param.pkt) + if done != param.wantDone || err != param.wantError { + t.Errorf("got r.process(%d, %d, %t, %d, _) = (_, _, %t, _, %v), want = (%t, %v)", param.first, param.last, param.more, proto, done, err, param.wantDone, param.wantError) + } + } + if diff := cmp.Diff(test.want, r.holes, cmp.AllowUnexported(hole{})); diff != "" { + t.Errorf("r.holes mismatch (-want +got):\n%s", diff) + } + }) } } diff --git a/pkg/tcpip/network/ip/BUILD b/pkg/tcpip/network/ip/BUILD new file mode 100644 index 000000000..ca1247c1e --- /dev/null +++ b/pkg/tcpip/network/ip/BUILD @@ -0,0 +1,26 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "ip", + srcs = ["generic_multicast_protocol.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/sync", + "//pkg/tcpip", + ], +) + +go_test( + name = "ip_test", + size = "small", + srcs = ["generic_multicast_protocol_test.go"], + deps = [ + ":ip", + "//pkg/sync", + "//pkg/tcpip", + "//pkg/tcpip/faketime", + "@com_github_google_go_cmp//cmp:go_default_library", + ], +) diff --git a/pkg/tcpip/network/ip/generic_multicast_protocol.go b/pkg/tcpip/network/ip/generic_multicast_protocol.go new file mode 100644 index 000000000..f2f0e069c --- /dev/null +++ b/pkg/tcpip/network/ip/generic_multicast_protocol.go @@ -0,0 +1,676 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ip holds IPv4/IPv6 common utilities. +package ip + +import ( + "fmt" + "math/rand" + "time" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// hostState is the state a host may be in for a multicast group. +type hostState int + +// The states below are generic across IGMPv2 (RFC 2236 section 6) and MLDv1 +// (RFC 2710 section 5). Even though the states are generic across both IGMPv2 +// and MLDv1, IGMPv2 terminology will be used. +// +// ______________receive query______________ +// | | +// | _____send or receive report_____ | +// | | | | +// V | V | +// +-------+ +-----------+ +------------+ +-------------------+ +--------+ | +// | Non-M | | Pending-M | | Delaying-M | | Queued Delaying-M | | Idle-M | - +// +-------+ +-----------+ +------------+ +-------------------+ +--------+ +// | ^ | ^ | ^ | ^ +// | | | | | | | | +// ---------- ------- ---------- ------------- +// initialize new send inital fail to send send or receive +// group membership report delayed report report +// +// Not shown in the diagram above, but any state may transition into the non +// member state when a group is left. +const ( + // nonMember is the "'Non-Member' state, when the host does not belong to the + // group on the interface. This is the initial state for all memberships on + // all network interfaces; it requires no storage in the host." + // + // 'Non-Listener' is the MLDv1 term used to describe this state. + // + // This state is used to keep track of groups that have been joined locally, + // but without advertising the membership to the network. + nonMember hostState = iota + + // pendingMember is a newly joined member that is waiting to successfully send + // the initial set of reports. + // + // This is not an RFC defined state; it is an implementation specific state to + // track that the initial report needs to be sent. + // + // MAY NOT transition to the idle member state from this state. + pendingMember + + // delayingMember is the "'Delaying Member' state, when the host belongs to + // the group on the interface and has a report delay timer running for that + // membership." + // + // 'Delaying Listener' is the MLDv1 term used to describe this state. + delayingMember + + // queuedDelayingMember is a delayingMember that failed to send a report after + // its delayed report timer fired. Hosts in this state are waiting to attempt + // retransmission of the delayed report. + // + // This is not an RFC defined state; it is an implementation specific state to + // track that the delayed report needs to be sent. + // + // May transition to idle member if a report is received for a group. + queuedDelayingMember + + // idleMember is the "Idle Member" state, when the host belongs to the group + // on the interface and does not have a report delay timer running for that + // membership. + // + // 'Idle Listener' is the MLDv1 term used to describe this state. + idleMember +) + +func (s hostState) isDelayingMember() bool { + switch s { + case nonMember, pendingMember, idleMember: + return false + case delayingMember, queuedDelayingMember: + return true + default: + panic(fmt.Sprintf("unrecognized host state = %d", s)) + } +} + +// multicastGroupState holds the Generic Multicast Protocol state for a +// multicast group. +type multicastGroupState struct { + // joins is the number of times the group has been joined. + joins uint64 + + // state holds the host's state for the group. + state hostState + + // lastToSendReport is true if we sent the last report for the group. It is + // used to track whether there are other hosts on the subnet that are also + // members of the group. + // + // Defined in RFC 2236 section 6 page 9 for IGMPv2 and RFC 2710 section 5 page + // 8 for MLDv1. + lastToSendReport bool + + // delayedReportJob is used to delay sending responses to membership report + // messages in order to reduce duplicate reports from multiple hosts on the + // interface. + // + // Must not be nil. + delayedReportJob *tcpip.Job +} + +// GenericMulticastProtocolOptions holds options for the generic multicast +// protocol. +type GenericMulticastProtocolOptions struct { + // Rand is the source of random numbers. + Rand *rand.Rand + + // Clock is the clock used to create timers. + Clock tcpip.Clock + + // Protocol is the implementation of the variant of multicast group protocol + // in use. + Protocol MulticastGroupProtocol + + // MaxUnsolicitedReportDelay is the maximum amount of time to wait between + // transmitting unsolicited reports. + // + // Unsolicited reports are transmitted when a group is newly joined. + MaxUnsolicitedReportDelay time.Duration + + // AllNodesAddress is a multicast address that all nodes on a network should + // be a member of. + // + // This address will not have the generic multicast protocol performed on it; + // it will be left in the non member/listener state, and packets will never + // be sent for it. + AllNodesAddress tcpip.Address +} + +// MulticastGroupProtocol is a multicast group protocol whose core state machine +// can be represented by GenericMulticastProtocolState. +type MulticastGroupProtocol interface { + // Enabled indicates whether the generic multicast protocol will be + // performed. + // + // When enabled, the protocol may transmit report and leave messages when + // joining and leaving multicast groups respectively, and handle incoming + // packets. + // + // When disabled, the protocol will still keep track of locally joined groups, + // it just won't transmit and handle packets, or update groups' state. + Enabled() bool + + // SendReport sends a multicast report for the specified group address. + // + // Returns false if the caller should queue the report to be sent later. Note, + // returning false does not mean that the receiver hit an error. + SendReport(groupAddress tcpip.Address) (sent bool, err *tcpip.Error) + + // SendLeave sends a multicast leave for the specified group address. + SendLeave(groupAddress tcpip.Address) *tcpip.Error +} + +// GenericMulticastProtocolState is the per interface generic multicast protocol +// state. +// +// There is actually no protocol named "Generic Multicast Protocol". Instead, +// the term used to refer to a generic multicast protocol that applies to both +// IPv4 and IPv6. Specifically, Generic Multicast Protocol is the core state +// machine of IGMPv2 as defined by RFC 2236 and MLDv1 as defined by RFC 2710. +// +// Callers must synchronize accesses to the generic multicast protocol state; +// GenericMulticastProtocolState obtains no locks in any of its methods. The +// only exception to this is GenericMulticastProtocolState's timer/job callbacks +// which will obtain the lock provided to the GenericMulticastProtocolState when +// it is initialized. +// +// GenericMulticastProtocolState.Init MUST be called before calling any of +// the methods on GenericMulticastProtocolState. +// +// GenericMulticastProtocolState.MakeAllNonMemberLocked MUST be called when the +// multicast group protocol is disabled so that leave messages may be sent. +type GenericMulticastProtocolState struct { + // Do not allow overwriting this state. + _ sync.NoCopy + + opts GenericMulticastProtocolOptions + + // memberships holds group addresses and their associated state. + memberships map[tcpip.Address]multicastGroupState + + // protocolMU is the mutex used to protect the protocol. + protocolMU *sync.RWMutex +} + +// Init initializes the Generic Multicast Protocol state. +// +// Must only be called once for the lifetime of g; Init will panic if it is +// called twice. +// +// The GenericMulticastProtocolState will only grab the lock when timers/jobs +// fire. +// +// Note: the methods on opts.Protocol will always be called while protocolMU is +// held. +func (g *GenericMulticastProtocolState) Init(protocolMU *sync.RWMutex, opts GenericMulticastProtocolOptions) { + if g.memberships != nil { + panic("attempted to initialize generic membership protocol state twice") + } + + *g = GenericMulticastProtocolState{ + opts: opts, + memberships: make(map[tcpip.Address]multicastGroupState), + protocolMU: protocolMU, + } +} + +// MakeAllNonMemberLocked transitions all groups to the non-member state. +// +// The groups will still be considered joined locally. +// +// MUST be called when the multicast group protocol is disabled. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) MakeAllNonMemberLocked() { + if !g.opts.Protocol.Enabled() { + return + } + + for groupAddress, info := range g.memberships { + g.transitionToNonMemberLocked(groupAddress, &info) + g.memberships[groupAddress] = info + } +} + +// InitializeGroupsLocked initializes each group, as if they were newly joined +// but without affecting the groups' join count. +// +// Must only be called after calling MakeAllNonMember as a group should not be +// initialized while it is not in the non-member state. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) InitializeGroupsLocked() { + if !g.opts.Protocol.Enabled() { + return + } + + for groupAddress, info := range g.memberships { + g.initializeNewMemberLocked(groupAddress, &info) + g.memberships[groupAddress] = info + } +} + +// SendQueuedReportsLocked attempts to send reports for groups that failed to +// send reports during their last attempt. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) SendQueuedReportsLocked() { + for groupAddress, info := range g.memberships { + switch info.state { + case nonMember, delayingMember, idleMember: + case pendingMember: + // pendingMembers failed to send their initial unsolicited report so try + // to send the report and queue the extra unsolicited reports. + g.maybeSendInitialReportLocked(groupAddress, &info) + case queuedDelayingMember: + // queuedDelayingMembers failed to send their delayed reports so try to + // send the report and transition them to the idle state. + g.maybeSendDelayedReportLocked(groupAddress, &info) + default: + panic(fmt.Sprintf("unrecognized host state = %d", info.state)) + } + g.memberships[groupAddress] = info + } +} + +// JoinGroupLocked handles joining a new group. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) JoinGroupLocked(groupAddress tcpip.Address) { + if info, ok := g.memberships[groupAddress]; ok { + // The group has already been joined. + info.joins++ + g.memberships[groupAddress] = info + return + } + + info := multicastGroupState{ + // Since we just joined the group, its count is 1. + joins: 1, + // The state will be updated below, if required. + state: nonMember, + lastToSendReport: false, + delayedReportJob: tcpip.NewJob(g.opts.Clock, g.protocolMU, func() { + if !g.opts.Protocol.Enabled() { + panic(fmt.Sprintf("delayed report job fired for group %s while the multicast group protocol is disabled", groupAddress)) + } + + info, ok := g.memberships[groupAddress] + if !ok { + panic(fmt.Sprintf("expected to find group state for group = %s", groupAddress)) + } + + g.maybeSendDelayedReportLocked(groupAddress, &info) + g.memberships[groupAddress] = info + }), + } + + if g.opts.Protocol.Enabled() { + g.initializeNewMemberLocked(groupAddress, &info) + } + + g.memberships[groupAddress] = info +} + +// IsLocallyJoinedRLocked returns true if the group is locally joined. +// +// Precondition: g.protocolMU must be read locked. +func (g *GenericMulticastProtocolState) IsLocallyJoinedRLocked(groupAddress tcpip.Address) bool { + _, ok := g.memberships[groupAddress] + return ok +} + +// LeaveGroupLocked handles leaving the group. +// +// Returns false if the group is not currently joined. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) LeaveGroupLocked(groupAddress tcpip.Address) bool { + info, ok := g.memberships[groupAddress] + if !ok { + return false + } + + if info.joins == 0 { + panic(fmt.Sprintf("tried to leave group %s with a join count of 0", groupAddress)) + } + info.joins-- + if info.joins != 0 { + // If we still have outstanding joins, then do nothing further. + g.memberships[groupAddress] = info + return true + } + + g.transitionToNonMemberLocked(groupAddress, &info) + delete(g.memberships, groupAddress) + return true +} + +// HandleQueryLocked handles a query message with the specified maximum response +// time. +// +// If the group address is unspecified, then reports will be scheduled for all +// joined groups. +// +// Report(s) will be scheduled to be sent after a random duration between 0 and +// the maximum response time. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) HandleQueryLocked(groupAddress tcpip.Address, maxResponseTime time.Duration) { + if !g.opts.Protocol.Enabled() { + return + } + + // As per RFC 2236 section 2.4 (for IGMPv2), + // + // In a Membership Query message, the group address field is set to zero + // when sending a General Query, and set to the group address being + // queried when sending a Group-Specific Query. + // + // As per RFC 2710 section 3.6 (for MLDv1), + // + // In a Query message, the Multicast Address field is set to zero when + // sending a General Query, and set to a specific IPv6 multicast address + // when sending a Multicast-Address-Specific Query. + if groupAddress.Unspecified() { + // This is a general query as the group address is unspecified. + for groupAddress, info := range g.memberships { + g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime) + g.memberships[groupAddress] = info + } + } else if info, ok := g.memberships[groupAddress]; ok { + g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime) + g.memberships[groupAddress] = info + } +} + +// HandleReportLocked handles a report message. +// +// If the report is for a joined group, any active delayed report will be +// cancelled and the host state for the group transitions to idle. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) HandleReportLocked(groupAddress tcpip.Address) { + if !g.opts.Protocol.Enabled() { + return + } + + // As per RFC 2236 section 3 pages 3-4 (for IGMPv2), + // + // If the host receives another host's Report (version 1 or 2) while it has + // a timer running, it stops its timer for the specified group and does not + // send a Report + // + // As per RFC 2710 section 4 page 6 (for MLDv1), + // + // If a node receives another node's Report from an interface for a + // multicast address while it has a timer running for that same address + // on that interface, it stops its timer and does not send a Report for + // that address, thus suppressing duplicate reports on the link. + if info, ok := g.memberships[groupAddress]; ok && info.state.isDelayingMember() { + info.delayedReportJob.Cancel() + info.lastToSendReport = false + info.state = idleMember + g.memberships[groupAddress] = info + } +} + +// initializeNewMemberLocked initializes a new group membership. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) initializeNewMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) { + if info.state != nonMember { + panic(fmt.Sprintf("host must be in non-member state to be initialized; group = %s, state = %d", groupAddress, info.state)) + } + + info.lastToSendReport = false + + if groupAddress == g.opts.AllNodesAddress { + // As per RFC 2236 section 6 page 10 (for IGMPv2), + // + // The all-systems group (address 224.0.0.1) is handled as a special + // case. The host starts in Idle Member state for that group on every + // interface, never transitions to another state, and never sends a + // report for that group. + // + // As per RFC 2710 section 5 page 10 (for MLDv1), + // + // The link-scope all-nodes address (FF02::1) is handled as a special + // case. The node starts in Idle Listener state for that address on + // every interface, never transitions to another state, and never sends + // a Report or Done for that address. + info.state = idleMember + return + } + + info.state = pendingMember + g.maybeSendInitialReportLocked(groupAddress, info) +} + +// maybeSendInitialReportLocked attempts to start transmission of the initial +// set of reports after newly joining a group. +// +// Host must be in pending member state. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) maybeSendInitialReportLocked(groupAddress tcpip.Address, info *multicastGroupState) { + if info.state != pendingMember { + panic(fmt.Sprintf("host must be in pending member state to send initial reports; group = %s, state = %d", groupAddress, info.state)) + } + + // As per RFC 2236 section 3 page 5 (for IGMPv2), + // + // When a host joins a multicast group, it should immediately transmit an + // unsolicited Version 2 Membership Report for that group" ... "it is + // recommended that it be repeated". + // + // As per RFC 2710 section 4 page 6 (for MLDv1), + // + // When a node starts listening to a multicast address on an interface, + // it should immediately transmit an unsolicited Report for that address + // on that interface, in case it is the first listener on the link. To + // cover the possibility of the initial Report being lost or damaged, it + // is recommended that it be repeated once or twice after short delays + // [Unsolicited Report Interval]. + // + // TODO(gvisor.dev/issue/4901): Support a configurable number of initial + // unsolicited reports. + sent, err := g.opts.Protocol.SendReport(groupAddress) + if err == nil && sent { + info.lastToSendReport = true + g.setDelayTimerForAddressRLocked(groupAddress, info, g.opts.MaxUnsolicitedReportDelay) + } +} + +// maybeSendDelayedReportLocked attempts to send the delayed report. +// +// Host must be in pending, delaying or queued delaying member state. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) maybeSendDelayedReportLocked(groupAddress tcpip.Address, info *multicastGroupState) { + if !info.state.isDelayingMember() { + panic(fmt.Sprintf("host must be in delaying or queued delaying member state to send delayed reports; group = %s, state = %d", groupAddress, info.state)) + } + + sent, err := g.opts.Protocol.SendReport(groupAddress) + if err == nil && sent { + info.lastToSendReport = true + info.state = idleMember + } else { + info.state = queuedDelayingMember + } +} + +// maybeSendLeave attempts to send a leave message. +func (g *GenericMulticastProtocolState) maybeSendLeave(groupAddress tcpip.Address, lastToSendReport bool) { + if !g.opts.Protocol.Enabled() || !lastToSendReport { + return + } + + if groupAddress == g.opts.AllNodesAddress { + // As per RFC 2236 section 6 page 10 (for IGMPv2), + // + // The all-systems group (address 224.0.0.1) is handled as a special + // case. The host starts in Idle Member state for that group on every + // interface, never transitions to another state, and never sends a + // report for that group. + // + // As per RFC 2710 section 5 page 10 (for MLDv1), + // + // The link-scope all-nodes address (FF02::1) is handled as a special + // case. The node starts in Idle Listener state for that address on + // every interface, never transitions to another state, and never sends + // a Report or Done for that address. + return + } + + // Okay to ignore the error here as if packet write failed, the multicast + // routers will eventually drop our membership anyways. If the interface is + // being disabled or removed, the generic multicast protocol's should be + // cleared eventually. + // + // As per RFC 2236 section 3 page 5 (for IGMPv2), + // + // When a router receives a Report, it adds the group being reported to + // the list of multicast group memberships on the network on which it + // received the Report and sets the timer for the membership to the + // [Group Membership Interval]. Repeated Reports refresh the timer. If + // no Reports are received for a particular group before this timer has + // expired, the router assumes that the group has no local members and + // that it need not forward remotely-originated multicasts for that + // group onto the attached network. + // + // As per RFC 2710 section 4 page 5 (for MLDv1), + // + // When a router receives a Report from a link, if the reported address + // is not already present in the router's list of multicast address + // having listeners on that link, the reported address is added to the + // list, its timer is set to [Multicast Listener Interval], and its + // appearance is made known to the router's multicast routing component. + // If a Report is received for a multicast address that is already + // present in the router's list, the timer for that address is reset to + // [Multicast Listener Interval]. If an address's timer expires, it is + // assumed that there are no longer any listeners for that address + // present on the link, so it is deleted from the list and its + // disappearance is made known to the multicast routing component. + // + // The requirement to send a leave message is also optional (it MAY be + // skipped): + // + // As per RFC 2236 section 6 page 8 (for IGMPv2), + // + // "send leave" for the group on the interface. If the interface + // state says the Querier is running IGMPv1, this action SHOULD be + // skipped. If the flag saying we were the last host to report is + // cleared, this action MAY be skipped. The Leave Message is sent to + // the ALL-ROUTERS group (224.0.0.2). + // + // As per RFC 2710 section 5 page 8 (for MLDv1), + // + // "send done" for the address on the interface. If the flag saying + // we were the last node to report is cleared, this action MAY be + // skipped. The Done message is sent to the link-scope all-routers + // address (FF02::2). + _ = g.opts.Protocol.SendLeave(groupAddress) +} + +// transitionToNonMemberLocked transitions the given multicast group the the +// non-member/listener state. +// +// Precondition: g.protocolMU must be locked. +func (g *GenericMulticastProtocolState) transitionToNonMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) { + if info.state == nonMember { + return + } + + info.delayedReportJob.Cancel() + g.maybeSendLeave(groupAddress, info.lastToSendReport) + info.lastToSendReport = false + info.state = nonMember +} + +// setDelayTimerForAddressRLocked sets timer to send a delay report. +// +// Precondition: g.protocolMU MUST be read locked. +func (g *GenericMulticastProtocolState) setDelayTimerForAddressRLocked(groupAddress tcpip.Address, info *multicastGroupState, maxResponseTime time.Duration) { + if info.state == nonMember { + return + } + + if groupAddress == g.opts.AllNodesAddress { + // As per RFC 2236 section 6 page 10 (for IGMPv2), + // + // The all-systems group (address 224.0.0.1) is handled as a special + // case. The host starts in Idle Member state for that group on every + // interface, never transitions to another state, and never sends a + // report for that group. + // + // As per RFC 2710 section 5 page 10 (for MLDv1), + // + // The link-scope all-nodes address (FF02::1) is handled as a special + // case. The node starts in Idle Listener state for that address on + // every interface, never transitions to another state, and never sends + // a Report or Done for that address. + return + } + + // As per RFC 2236 section 3 page 3 (for IGMPv2), + // + // If a timer for the group is already unning, it is reset to the random + // value only if the requested Max Response Time is less than the remaining + // value of the running timer. + // + // As per RFC 2710 section 4 page 5 (for MLDv1), + // + // If a timer for any address is already running, it is reset to the new + // random value only if the requested Maximum Response Delay is less than + // the remaining value of the running timer. + if info.state == delayingMember { + // TODO: Reset the timer if time remaining is greater than maxResponseTime. + return + } + + info.state = delayingMember + info.delayedReportJob.Cancel() + info.delayedReportJob.Schedule(g.calculateDelayTimerDuration(maxResponseTime)) +} + +// calculateDelayTimerDuration returns a random time between (0, maxRespTime]. +func (g *GenericMulticastProtocolState) calculateDelayTimerDuration(maxRespTime time.Duration) time.Duration { + // As per RFC 2236 section 3 page 3 (for IGMPv2), + // + // When a host receives a Group-Specific Query, it sets a delay timer to a + // random value selected from the range (0, Max Response Time]... + // + // As per RFC 2710 section 4 page 6 (for MLDv1), + // + // When a node receives a Multicast-Address-Specific Query, if it is + // listening to the queried Multicast Address on the interface from + // which the Query was received, it sets a delay timer for that address + // to a random value selected from the range [0, Maximum Response Delay], + // as above. + if maxRespTime == 0 { + return 0 + } + return time.Duration(g.opts.Rand.Int63n(int64(maxRespTime))) +} diff --git a/pkg/tcpip/network/ip/generic_multicast_protocol_test.go b/pkg/tcpip/network/ip/generic_multicast_protocol_test.go new file mode 100644 index 000000000..f56f7aa90 --- /dev/null +++ b/pkg/tcpip/network/ip/generic_multicast_protocol_test.go @@ -0,0 +1,877 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ip_test + +import ( + "math/rand" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/network/ip" +) + +const ( + addr1 = tcpip.Address("\x01") + addr2 = tcpip.Address("\x02") + addr3 = tcpip.Address("\x03") + addr4 = tcpip.Address("\x04") + + maxUnsolicitedReportDelay = time.Second +) + +var _ ip.MulticastGroupProtocol = (*mockMulticastGroupProtocol)(nil) + +type mockMulticastGroupProtocol struct { + t *testing.T + + mu sync.RWMutex + + // Must only be accessed with mu held. + sendReportGroupAddrCount map[tcpip.Address]int + + // Must only be accessed with mu held. + sendLeaveGroupAddrCount map[tcpip.Address]int + + // Must only be accessed with mu held. + makeQueuePackets bool + + // Must only be accessed with mu held. + disabled bool +} + +func (m *mockMulticastGroupProtocol) init() { + m.mu.Lock() + defer m.mu.Unlock() + m.initLocked() +} + +func (m *mockMulticastGroupProtocol) initLocked() { + m.sendReportGroupAddrCount = make(map[tcpip.Address]int) + m.sendLeaveGroupAddrCount = make(map[tcpip.Address]int) +} + +func (m *mockMulticastGroupProtocol) setEnabled(v bool) { + m.mu.Lock() + defer m.mu.Unlock() + m.disabled = !v +} + +// Enabled implements ip.MulticastGroupProtocol. +// +// Precondition: m.mu must be read locked. +func (m *mockMulticastGroupProtocol) Enabled() bool { + return !m.disabled +} + +// SendReport implements ip.MulticastGroupProtocol. +// +// Precondition: m.mu must be locked. +func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) (bool, *tcpip.Error) { + if m.mu.TryLock() { + m.mu.Unlock() + m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) + } + if m.mu.TryRLock() { + m.mu.RUnlock() + m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) + } + + m.sendReportGroupAddrCount[groupAddress]++ + return !m.makeQueuePackets, nil +} + +// SendLeave implements ip.MulticastGroupProtocol. +// +// Precondition: m.mu must be locked. +func (m *mockMulticastGroupProtocol) SendLeave(groupAddress tcpip.Address) *tcpip.Error { + if m.mu.TryLock() { + m.mu.Unlock() + m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) + } + if m.mu.TryRLock() { + m.mu.RUnlock() + m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) + } + + m.sendLeaveGroupAddrCount[groupAddress]++ + return nil +} + +func (m *mockMulticastGroupProtocol) check(sendReportGroupAddresses []tcpip.Address, sendLeaveGroupAddresses []tcpip.Address) string { + m.mu.Lock() + defer m.mu.Unlock() + + sendReportGroupAddrCount := make(map[tcpip.Address]int) + for _, a := range sendReportGroupAddresses { + sendReportGroupAddrCount[a] = 1 + } + + sendLeaveGroupAddrCount := make(map[tcpip.Address]int) + for _, a := range sendLeaveGroupAddresses { + sendLeaveGroupAddrCount[a] = 1 + } + + diff := cmp.Diff( + &mockMulticastGroupProtocol{ + sendReportGroupAddrCount: sendReportGroupAddrCount, + sendLeaveGroupAddrCount: sendLeaveGroupAddrCount, + }, + m, + cmp.AllowUnexported(mockMulticastGroupProtocol{}), + // ignore mockMulticastGroupProtocol.mu and mockMulticastGroupProtocol.t + cmp.FilterPath( + func(p cmp.Path) bool { + switch p.Last().String() { + case ".mu", ".t", ".makeQueuePackets", ".disabled": + return true + } + return false + }, + cmp.Ignore(), + ), + ) + m.initLocked() + return diff +} + +func TestJoinGroup(t *testing.T) { + tests := []struct { + name string + addr tcpip.Address + shouldSendReports bool + }{ + { + name: "Normal group", + addr: addr1, + shouldSendReports: true, + }, + { + name: "All-nodes group", + addr: addr2, + shouldSendReports: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var g ip.GenericMulticastProtocolState + mgp := mockMulticastGroupProtocol{t: t} + clock := faketime.NewManualClock() + + mgp.init() + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(0)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay, + AllNodesAddress: addr2, + }) + + // Joining a group should send a report immediately and another after + // a random interval between 0 and the maximum unsolicited report delay. + mgp.mu.Lock() + g.JoinGroupLocked(test.addr) + mgp.mu.Unlock() + if test.shouldSendReports { + if diff := mgp.check([]tcpip.Address{test.addr} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Generic multicast protocol timers are expected to take the job mutex. + clock.Advance(maxUnsolicitedReportDelay) + if diff := mgp.check([]tcpip.Address{test.addr} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + } + + // Should have no more messages to send. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestLeaveGroup(t *testing.T) { + tests := []struct { + name string + addr tcpip.Address + shouldSendMessages bool + }{ + { + name: "Normal group", + addr: addr1, + shouldSendMessages: true, + }, + { + name: "All-nodes group", + addr: addr2, + shouldSendMessages: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var g ip.GenericMulticastProtocolState + mgp := mockMulticastGroupProtocol{t: t} + clock := faketime.NewManualClock() + + mgp.init() + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(1)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay, + AllNodesAddress: addr2, + }) + + mgp.mu.Lock() + g.JoinGroupLocked(test.addr) + mgp.mu.Unlock() + if test.shouldSendMessages { + if diff := mgp.check([]tcpip.Address{test.addr} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + } + + // Leaving a group should send a leave report immediately and cancel any + // delayed reports. + { + mgp.mu.Lock() + res := g.LeaveGroupLocked(test.addr) + mgp.mu.Unlock() + if !res { + t.Fatalf("got g.LeaveGroupLocked(%s) = false, want = true", test.addr) + } + } + if test.shouldSendMessages { + if diff := mgp.check(nil /* sendReportGroupAddresses */, []tcpip.Address{test.addr} /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + } + + // Should have no more messages to send. + // + // Generic multicast protocol timers are expected to take the job mutex. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestHandleReport(t *testing.T) { + tests := []struct { + name string + reportAddr tcpip.Address + expectReportsFor []tcpip.Address + }{ + { + name: "Unpecified empty", + reportAddr: "", + expectReportsFor: []tcpip.Address{addr1, addr2}, + }, + { + name: "Unpecified any", + reportAddr: "\x00", + expectReportsFor: []tcpip.Address{addr1, addr2}, + }, + { + name: "Specified", + reportAddr: addr1, + expectReportsFor: []tcpip.Address{addr2}, + }, + { + name: "Specified all-nodes", + reportAddr: addr3, + expectReportsFor: []tcpip.Address{addr1, addr2}, + }, + { + name: "Specified other", + reportAddr: addr4, + expectReportsFor: []tcpip.Address{addr1, addr2}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var g ip.GenericMulticastProtocolState + mgp := mockMulticastGroupProtocol{t: t} + clock := faketime.NewManualClock() + + mgp.init() + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(2)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay, + AllNodesAddress: addr3, + }) + + mgp.mu.Lock() + g.JoinGroupLocked(addr1) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + mgp.mu.Lock() + g.JoinGroupLocked(addr2) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + mgp.mu.Lock() + g.JoinGroupLocked(addr3) + mgp.mu.Unlock() + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Receiving a report for a group we have a timer scheduled for should + // cancel our delayed report timer for the group. + mgp.mu.Lock() + g.HandleReportLocked(test.reportAddr) + mgp.mu.Unlock() + if len(test.expectReportsFor) != 0 { + // Generic multicast protocol timers are expected to take the job mutex. + clock.Advance(maxUnsolicitedReportDelay) + if diff := mgp.check(test.expectReportsFor /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + } + + // Should have no more messages to send. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestHandleQuery(t *testing.T) { + tests := []struct { + name string + queryAddr tcpip.Address + maxDelay time.Duration + expectReportsFor []tcpip.Address + }{ + { + name: "Unpecified empty", + queryAddr: "", + maxDelay: 0, + expectReportsFor: []tcpip.Address{addr1, addr2}, + }, + { + name: "Unpecified any", + queryAddr: "\x00", + maxDelay: 1, + expectReportsFor: []tcpip.Address{addr1, addr2}, + }, + { + name: "Specified", + queryAddr: addr1, + maxDelay: 2, + expectReportsFor: []tcpip.Address{addr1}, + }, + { + name: "Specified all-nodes", + queryAddr: addr3, + maxDelay: 3, + expectReportsFor: nil, + }, + { + name: "Specified other", + queryAddr: addr4, + maxDelay: 4, + expectReportsFor: nil, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var g ip.GenericMulticastProtocolState + mgp := mockMulticastGroupProtocol{t: t} + clock := faketime.NewManualClock() + + mgp.init() + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(3)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay, + AllNodesAddress: addr3, + }) + + mgp.mu.Lock() + g.JoinGroupLocked(addr1) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + mgp.mu.Lock() + g.JoinGroupLocked(addr2) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + mgp.mu.Lock() + g.JoinGroupLocked(addr3) + mgp.mu.Unlock() + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + // Generic multicast protocol timers are expected to take the job mutex. + clock.Advance(maxUnsolicitedReportDelay) + if diff := mgp.check([]tcpip.Address{addr1, addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Receiving a query should make us schedule a new delayed report if it + // is a query directed at us or a general query. + mgp.mu.Lock() + g.HandleQueryLocked(test.queryAddr, test.maxDelay) + mgp.mu.Unlock() + if len(test.expectReportsFor) != 0 { + clock.Advance(test.maxDelay) + if diff := mgp.check(test.expectReportsFor /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + } + + // Should have no more messages to send. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestJoinCount(t *testing.T) { + var g ip.GenericMulticastProtocolState + mgp := mockMulticastGroupProtocol{t: t} + clock := faketime.NewManualClock() + + mgp.init() + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(4)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: time.Second, + }) + + // Set the join count to 2 for a group. + { + mgp.mu.Lock() + g.JoinGroupLocked(addr1) + res := g.IsLocallyJoinedRLocked(addr1) + mgp.mu.Unlock() + if !res { + t.Fatalf("got g.IsLocallyJoinedRLocked(%s) = false, want = true", addr1) + } + } + // Only the first join should trigger a report to be sent. + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + { + mgp.mu.Lock() + g.JoinGroupLocked(addr1) + res := g.IsLocallyJoinedRLocked(addr1) + mgp.mu.Unlock() + if !res { + t.Errorf("got g.IsLocallyJoinedRLocked(%s) = false, want = true", addr1) + } + } + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + if t.Failed() { + t.FailNow() + } + + // Group should still be considered joined after leaving once. + { + mgp.mu.Lock() + leaveGroupRes := g.LeaveGroupLocked(addr1) + isLocallyJoined := g.IsLocallyJoinedRLocked(addr1) + mgp.mu.Unlock() + if !leaveGroupRes { + t.Errorf("got g.LeaveGroupLocked(%s) = false, want = true", addr1) + } + if !isLocallyJoined { + t.Errorf("got g.IsLocallyJoinedRLocked(%s) = false, want = true", addr1) + } + } + // A leave report should only be sent once the join count reaches 0. + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + if t.Failed() { + t.FailNow() + } + + // Leaving once more should actually remove us from the group. + { + mgp.mu.Lock() + leaveGroupRes := g.LeaveGroupLocked(addr1) + isLocallyJoined := g.IsLocallyJoinedRLocked(addr1) + mgp.mu.Unlock() + if !leaveGroupRes { + t.Errorf("got g.LeaveGroupLocked(%s) = false, want = true", addr1) + } + if isLocallyJoined { + t.Errorf("got g.IsLocallyJoinedRLocked(%s) = true, want = false", addr1) + } + } + if diff := mgp.check(nil /* sendReportGroupAddresses */, []tcpip.Address{addr1} /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + if t.Failed() { + t.FailNow() + } + + // Group should no longer be joined so we should not have anything to + // leave. + { + mgp.mu.Lock() + leaveGroupRes := g.LeaveGroupLocked(addr1) + isLocallyJoined := g.IsLocallyJoinedRLocked(addr1) + mgp.mu.Unlock() + if leaveGroupRes { + t.Errorf("got g.LeaveGroupLocked(%s) = true, want = false", addr1) + } + if isLocallyJoined { + t.Errorf("got g.IsLocallyJoinedRLocked(%s) = true, want = false", addr1) + } + } + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Should have no more messages to send. + // + // Generic multicast protocol timers are expected to take the job mutex. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } +} + +func TestMakeAllNonMemberAndInitialize(t *testing.T) { + var g ip.GenericMulticastProtocolState + mgp := mockMulticastGroupProtocol{t: t} + clock := faketime.NewManualClock() + + mgp.init() + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(3)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay, + AllNodesAddress: addr3, + }) + + mgp.mu.Lock() + g.JoinGroupLocked(addr1) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + mgp.mu.Lock() + g.JoinGroupLocked(addr2) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + mgp.mu.Lock() + g.JoinGroupLocked(addr3) + mgp.mu.Unlock() + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Should send the leave reports for each but still consider them locally + // joined. + mgp.mu.Lock() + g.MakeAllNonMemberLocked() + mgp.mu.Unlock() + if diff := mgp.check(nil /* sendReportGroupAddresses */, []tcpip.Address{addr1, addr2} /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + // Generic multicast protocol timers are expected to take the job mutex. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + for _, group := range []tcpip.Address{addr1, addr2, addr3} { + mgp.mu.RLock() + res := g.IsLocallyJoinedRLocked(group) + mgp.mu.RUnlock() + if !res { + t.Fatalf("got g.IsLocallyJoinedRLocked(%s) = false, want = true", group) + } + } + + // Should send the initial set of unsolcited reports. + mgp.mu.Lock() + g.InitializeGroupsLocked() + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr1, addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + clock.Advance(maxUnsolicitedReportDelay) + if diff := mgp.check([]tcpip.Address{addr1, addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Should have no more messages to send. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } +} + +// TestGroupStateNonMember tests that groups do not send packets when in the +// non-member state, but are still considered locally joined. +func TestGroupStateNonMember(t *testing.T) { + var g ip.GenericMulticastProtocolState + mgp := mockMulticastGroupProtocol{t: t} + clock := faketime.NewManualClock() + + mgp.init() + mgp.setEnabled(false) + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(3)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay, + }) + + // Joining groups should not send any reports. + { + mgp.mu.Lock() + g.JoinGroupLocked(addr1) + res := g.IsLocallyJoinedRLocked(addr1) + mgp.mu.Unlock() + if !res { + t.Fatalf("got g.IsLocallyJoinedRLocked(%s) = false, want = true", addr1) + } + } + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + { + mgp.mu.Lock() + g.JoinGroupLocked(addr2) + res := g.IsLocallyJoinedRLocked(addr2) + mgp.mu.Unlock() + if !res { + t.Fatalf("got g.IsLocallyJoinedRLocked(%s) = false, want = true", addr2) + } + } + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Receiving a query should not send any reports. + mgp.mu.Lock() + g.HandleQueryLocked(addr1, time.Nanosecond) + mgp.mu.Unlock() + // Generic multicast protocol timers are expected to take the job mutex. + clock.Advance(time.Nanosecond) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Leaving groups should not send any leave messages. + { + mgp.mu.Lock() + addr2LeaveRes := g.LeaveGroupLocked(addr2) + addr1IsJoined := g.IsLocallyJoinedRLocked(addr1) + addr2IsJoined := g.IsLocallyJoinedRLocked(addr2) + mgp.mu.Unlock() + if !addr2LeaveRes { + t.Errorf("got g.LeaveGroupLocked(%s) = false, want = true", addr2) + } + if !addr1IsJoined { + t.Errorf("got g.IsLocallyJoinedRLocked(%s) = false, want = true", addr1) + } + if addr2IsJoined { + t.Errorf("got g.IsLocallyJoinedRLocked(%s) = true, want = false", addr2) + } + } + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } +} + +func TestQueuedPackets(t *testing.T) { + var g ip.GenericMulticastProtocolState + var mgp mockMulticastGroupProtocol + mgp.init() + clock := faketime.NewManualClock() + g.Init(&mgp.mu, ip.GenericMulticastProtocolOptions{ + Rand: rand.New(rand.NewSource(4)), + Clock: clock, + Protocol: &mgp, + MaxUnsolicitedReportDelay: maxUnsolicitedReportDelay, + }) + + // Joining should trigger a SendReport, but mgp should report that we did not + // send the packet. + mgp.mu.Lock() + mgp.makeQueuePackets = true + g.JoinGroupLocked(addr1) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // The delayed report timer should have been cancelled since we did not send + // the initial report earlier. + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Mock being able to successfully send the report. + mgp.mu.Lock() + mgp.makeQueuePackets = false + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // The delayed report (sent after the initial report) should now be sent. + clock.Advance(maxUnsolicitedReportDelay) + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Should not have anything else to send (we should be idle). + mgp.mu.Lock() + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Receive a query but mock being unable to send reports again. + mgp.mu.Lock() + mgp.makeQueuePackets = true + g.HandleQueryLocked(addr1, time.Nanosecond) + mgp.mu.Unlock() + clock.Advance(time.Nanosecond) + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Mock being able to send reports again - we should have a packet queued to + // send. + mgp.mu.Lock() + mgp.makeQueuePackets = false + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Should not have anything else to send. + mgp.mu.Lock() + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Receive a query again, but mock being unable to send reports. + mgp.mu.Lock() + mgp.makeQueuePackets = true + g.HandleQueryLocked(addr1, time.Nanosecond) + mgp.mu.Unlock() + clock.Advance(time.Nanosecond) + if diff := mgp.check([]tcpip.Address{addr1} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Receiving a report should should transition us into the idle member state, + // even if we had a packet queued. We should no longer have any packets to + // send. + mgp.mu.Lock() + g.HandleReportLocked(addr1) + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // When we fail to send the initial set of reports, incoming reports should + // not affect a newly joined group's reports from being sent. + mgp.mu.Lock() + mgp.makeQueuePackets = true + g.JoinGroupLocked(addr2) + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + mgp.mu.Lock() + g.HandleReportLocked(addr2) + // Attempting to send queued reports while still unable to send reports should + // not change the host state. + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + // Mock being able to successfully send the report. + mgp.mu.Lock() + mgp.makeQueuePackets = false + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + if diff := mgp.check([]tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + // The delayed report (sent after the initial report) should now be sent. + clock.Advance(maxUnsolicitedReportDelay) + if diff := mgp.check([]tcpip.Address{addr2} /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Errorf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } + + // Should not have anything else to send. + mgp.mu.Lock() + g.SendQueuedReportsLocked() + mgp.mu.Unlock() + clock.Advance(time.Hour) + if diff := mgp.check(nil /* sendReportGroupAddresses */, nil /* sendLeaveGroupAddresses */); diff != "" { + t.Fatalf("mockMulticastGroupProtocol mismatch (-want +got):\n%s", diff) + } +} diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index d49c44846..3005973d7 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -193,10 +193,6 @@ func (*testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt stack.PacketBu panic("not implemented") } -func (*testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { - return tcpip.ErrNotSupported -} - // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*testObject) ARPHardwareType() header.ARPHardwareType { panic("not implemented") @@ -207,7 +203,7 @@ func (*testObject) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net panic("not implemented") } -func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { +func buildIPv4Route(local, remote tcpip.Address) (*stack.Route, *tcpip.Error) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, @@ -223,7 +219,7 @@ func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { return s.FindRoute(nicID, local, remote, ipv4.ProtocolNumber, false /* multicastLoop */) } -func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { +func buildIPv6Route(local, remote tcpip.Address) (*stack.Route, *tcpip.Error) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol}, TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, @@ -348,11 +344,11 @@ func TestSourceAddressValidation(t *testing.T) { pkt.SetChecksum(header.ICMPv6Checksum(pkt, src, localIPv6Addr, buffer.VectorisedView{})) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: header.ICMPv6MinimumSize, - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: ipv6.DefaultTTL, - SrcAddr: src, - DstAddr: localIPv6Addr, + PayloadLength: header.ICMPv6MinimumSize, + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: localIPv6Addr, }) e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -554,7 +550,7 @@ func TestIPv4Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{ + if err := ep.WritePacket(r, nil /* gso */, stack.NetworkHeaderParams{ Protocol: 123, TTL: 123, TOS: stack.DefaultTOS, @@ -623,11 +619,11 @@ func TestReceive(t *testing.T) { view := buffer.NewView(header.IPv6MinimumSize + payloadLen) ip := header.IPv6(view) ip.Encode(&header.IPv6Fields{ - PayloadLength: payloadLen, - NextHeader: 10, - HopLimit: ipv6.DefaultTTL, - SrcAddr: remoteIPv6Addr, - DstAddr: localIPv6Addr, + PayloadLength: payloadLen, + TransportProtocol: 10, + HopLimit: ipv6.DefaultTTL, + SrcAddr: remoteIPv6Addr, + DstAddr: localIPv6Addr, }) // Make payload be non-zero. @@ -937,7 +933,7 @@ func TestIPv6Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{ + if err := ep.WritePacket(r, nil /* gso */, stack.NetworkHeaderParams{ Protocol: 123, TTL: 123, TOS: stack.DefaultTOS, @@ -997,11 +993,11 @@ func TestIPv6ReceiveControl(t *testing.T) { // Create the outer IPv6 header. ip := header.IPv6(view) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(len(view) - header.IPv6MinimumSize - c.trunc), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 20, - SrcAddr: outerSrcAddr, - DstAddr: localIPv6Addr, + PayloadLength: uint16(len(view) - header.IPv6MinimumSize - c.trunc), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: 20, + SrcAddr: outerSrcAddr, + DstAddr: localIPv6Addr, }) // Create the ICMP header. @@ -1011,28 +1007,27 @@ func TestIPv6ReceiveControl(t *testing.T) { icmp.SetIdent(0xdead) icmp.SetSequence(0xbeef) - // Create the inner IPv6 header. - ip = header.IPv6(view[header.IPv6MinimumSize+header.ICMPv6PayloadOffset:]) - ip.Encode(&header.IPv6Fields{ - PayloadLength: 100, - NextHeader: 10, - HopLimit: 20, - SrcAddr: localIPv6Addr, - DstAddr: remoteIPv6Addr, - }) - + var extHdrs header.IPv6ExtHdrSerializer // Build the fragmentation header if needed. if c.fragmentOffset != nil { - ip.SetNextHeader(header.IPv6FragmentHeader) - frag := header.IPv6Fragment(view[2*header.IPv6MinimumSize+header.ICMPv6MinimumSize:]) - frag.Encode(&header.IPv6FragmentFields{ - NextHeader: 10, + extHdrs = append(extHdrs, &header.IPv6SerializableFragmentExtHdr{ FragmentOffset: *c.fragmentOffset, M: true, Identification: 0x12345678, }) } + // Create the inner IPv6 header. + ip = header.IPv6(view[header.IPv6MinimumSize+header.ICMPv6PayloadOffset:]) + ip.Encode(&header.IPv6Fields{ + PayloadLength: 100, + TransportProtocol: 10, + HopLimit: 20, + SrcAddr: localIPv6Addr, + DstAddr: remoteIPv6Addr, + ExtensionHeaders: extHdrs, + }) + // Make payload be non-zero. for i := dataOffset; i < len(view); i++ { view[i] = uint8(i) @@ -1093,7 +1088,19 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { dataBuf := [dataLen]byte{1, 2, 3, 4} data := dataBuf[:] - ipv4Options := header.IPv4Options{0, 1, 0, 1} + ipv4Options := header.IPv4OptionsSerializer{ + &header.IPv4SerializableListEndOption{}, + &header.IPv4SerializableNOPOption{}, + &header.IPv4SerializableListEndOption{}, + &header.IPv4SerializableNOPOption{}, + } + + expectOptions := header.IPv4Options{ + byte(header.IPv4OptionListEndType), + byte(header.IPv4OptionNOPType), + byte(header.IPv4OptionListEndType), + byte(header.IPv4OptionNOPType), + } ipv6FragmentExtHdrBuf := [header.IPv6FragmentExtHdrLength]byte{transportProto, 0, 62, 4, 1, 2, 3, 4} ipv6FragmentExtHdr := ipv6FragmentExtHdrBuf[:] @@ -1243,7 +1250,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { nicAddr: localIPv4Addr, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { - ipHdrLen := header.IPv4MinimumSize + ipv4Options.SizeWithPadding() + ipHdrLen := int(header.IPv4MinimumSize + ipv4Options.Length()) totalLen := ipHdrLen + len(data) hdr := buffer.NewPrependable(totalLen) if n := copy(hdr.Prepend(len(data)), data); n != len(data) { @@ -1266,7 +1273,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { netHdr := pkt.NetworkHeader() - hdrLen := header.IPv4MinimumSize + len(ipv4Options) + hdrLen := int(header.IPv4MinimumSize + ipv4Options.Length()) if len(netHdr.View()) != hdrLen { t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen) } @@ -1276,7 +1283,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { checker.DstAddr(remoteIPv4Addr), checker.IPv4HeaderLength(hdrLen), checker.IPFullLength(uint16(hdrLen+len(data))), - checker.IPv4Options(ipv4Options), + checker.IPv4Options(expectOptions), checker.IPPayload(data), ) }, @@ -1288,7 +1295,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { nicAddr: localIPv4Addr, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { - ip := header.IPv4(make([]byte, header.IPv4MinimumSize+ipv4Options.SizeWithPadding())) + ip := header.IPv4(make([]byte, header.IPv4MinimumSize+ipv4Options.Length())) ip.Encode(&header.IPv4Fields{ Protocol: transportProto, TTL: ipv4.DefaultTTL, @@ -1307,7 +1314,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { netHdr := pkt.NetworkHeader() - hdrLen := header.IPv4MinimumSize + len(ipv4Options) + hdrLen := int(header.IPv4MinimumSize + ipv4Options.Length()) if len(netHdr.View()) != hdrLen { t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen) } @@ -1317,7 +1324,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { checker.DstAddr(remoteIPv4Addr), checker.IPv4HeaderLength(hdrLen), checker.IPFullLength(uint16(hdrLen+len(data))), - checker.IPv4Options(ipv4Options), + checker.IPv4Options(expectOptions), checker.IPPayload(data), ) }, @@ -1336,10 +1343,10 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { } ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - NextHeader: transportProto, - HopLimit: ipv6.DefaultTTL, - SrcAddr: src, - DstAddr: header.IPv4Any, + TransportProtocol: transportProto, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, }) return hdr.View().ToVectorisedView() }, @@ -1379,10 +1386,12 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { } ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - NextHeader: uint8(header.IPv6FragmentExtHdrIdentifier), - HopLimit: ipv6.DefaultTTL, - SrcAddr: src, - DstAddr: header.IPv4Any, + // NB: we're lying about transport protocol here to verify the raw + // fragment header bytes. + TransportProtocol: tcpip.TransportProtocolNumber(header.IPv6FragmentExtHdrIdentifier), + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, }) return hdr.View().ToVectorisedView() }, @@ -1414,10 +1423,10 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ip := header.IPv6(make([]byte, header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - NextHeader: transportProto, - HopLimit: ipv6.DefaultTTL, - SrcAddr: src, - DstAddr: header.IPv4Any, + TransportProtocol: transportProto, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, }) return buffer.View(ip).ToVectorisedView() }, @@ -1449,10 +1458,10 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ip := header.IPv6(make([]byte, header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - NextHeader: transportProto, - HopLimit: ipv6.DefaultTTL, - SrcAddr: src, - DstAddr: header.IPv4Any, + TransportProtocol: transportProto, + HopLimit: ipv6.DefaultTTL, + SrcAddr: src, + DstAddr: header.IPv4Any, }) return buffer.View(ip[:len(ip)-1]).ToVectorisedView() }, diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 6252614ec..32f53f217 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -6,6 +6,7 @@ go_library( name = "ipv4", srcs = [ "icmp.go", + "igmp.go", "ipv4.go", ], visibility = ["//visibility:public"], @@ -17,6 +18,7 @@ go_library( "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/network/hash", + "//pkg/tcpip/network/ip", "//pkg/tcpip/stack", ], ) @@ -24,7 +26,10 @@ go_library( go_test( name = "ipv4_test", size = "small", - srcs = ["ipv4_test.go"], + srcs = [ + "igmp_test.go", + "ipv4_test.go", + ], deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index 488945226..8e392f86c 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -63,7 +63,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { stats := e.protocol.stack.Stats() - received := stats.ICMP.V4PacketsReceived + received := stats.ICMP.V4.PacketsReceived // TODO(gvisor.dev/issue/170): ICMP packets don't have their // TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a // full explanation. @@ -130,7 +130,7 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { case header.ICMPv4Echo: received.Echo.Increment() - sent := stats.ICMP.V4PacketsSent + sent := stats.ICMP.V4.PacketsSent if !e.protocol.stack.AllowICMPMessage() { sent.RateLimited.Increment() return @@ -379,7 +379,7 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi } defer route.Release() - sent := p.stack.Stats().ICMP.V4PacketsSent + sent := p.stack.Stats().ICMP.V4.PacketsSent if !p.stack.AllowICMPMessage() { sent.RateLimited.Increment() return nil diff --git a/pkg/tcpip/network/ipv4/igmp.go b/pkg/tcpip/network/ipv4/igmp.go new file mode 100644 index 000000000..da88d65d1 --- /dev/null +++ b/pkg/tcpip/network/ipv4/igmp.go @@ -0,0 +1,345 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipv4 + +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/network/ip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +const ( + // igmpV1PresentDefault is the initial state for igmpV1Present in the + // igmpState. As per RFC 2236 Page 9 says "No IGMPv1 Router Present ... is + // the initial state." + igmpV1PresentDefault = 0 + + // v1RouterPresentTimeout from RFC 2236 Section 8.11, Page 18 + // See note on igmpState.igmpV1Present for more detail. + v1RouterPresentTimeout = 400 * time.Second + + // v1MaxRespTime from RFC 2236 Section 4, Page 5. "The IGMPv1 router + // will send General Queries with the Max Response Time set to 0. This MUST + // be interpreted as a value of 100 (10 seconds)." + // + // Note that the Max Response Time field is a value in units of deciseconds. + v1MaxRespTime = 10 * time.Second + + // UnsolicitedReportIntervalMax is the maximum delay between sending + // unsolicited IGMP reports. + // + // Obtained from RFC 2236 Section 8.10, Page 19. + UnsolicitedReportIntervalMax = 10 * time.Second +) + +// IGMPOptions holds options for IGMP. +type IGMPOptions struct { + // Enabled indicates whether IGMP will be performed. + // + // When enabled, IGMP may transmit IGMP report and leave messages when + // joining and leaving multicast groups respectively, and handle incoming + // IGMP packets. + // + // This field is ignored and is always assumed to be false for interfaces + // without neighbouring nodes (e.g. loopback). + Enabled bool +} + +var _ ip.MulticastGroupProtocol = (*igmpState)(nil) + +// igmpState is the per-interface IGMP state. +// +// igmpState.init() MUST be called after creating an IGMP state. +type igmpState struct { + // The IPv4 endpoint this igmpState is for. + ep *endpoint + + genericMulticastProtocol ip.GenericMulticastProtocolState + + // igmpV1Present is for maintaining compatibility with IGMPv1 Routers, from + // RFC 2236 Section 4 Page 6: "The IGMPv1 router expects Version 1 + // Membership Reports in response to its Queries, and will not pay + // attention to Version 2 Membership Reports. Therefore, a state variable + // MUST be kept for each interface, describing whether the multicast + // Querier on that interface is running IGMPv1 or IGMPv2. This variable + // MUST be based upon whether or not an IGMPv1 query was heard in the last + // [Version 1 Router Present Timeout] seconds". + // + // Must be accessed with atomic operations. Holds a value of 1 when true, 0 + // when false. + igmpV1Present uint32 + + // igmpV1Job is scheduled when this interface receives an IGMPv1 style + // message, upon expiration the igmpV1Present flag is cleared. + // igmpV1Job may not be nil once igmpState is initialized. + igmpV1Job *tcpip.Job +} + +// Enabled implements ip.MulticastGroupProtocol. +func (igmp *igmpState) Enabled() bool { + // No need to perform IGMP on loopback interfaces since they don't have + // neighbouring nodes. + return igmp.ep.protocol.options.IGMP.Enabled && !igmp.ep.nic.IsLoopback() && igmp.ep.Enabled() +} + +// SendReport implements ip.MulticastGroupProtocol. +// +// Precondition: igmp.ep.mu must be read locked. +func (igmp *igmpState) SendReport(groupAddress tcpip.Address) (bool, *tcpip.Error) { + igmpType := header.IGMPv2MembershipReport + if igmp.v1Present() { + igmpType = header.IGMPv1MembershipReport + } + return igmp.writePacket(groupAddress, groupAddress, igmpType) +} + +// SendLeave implements ip.MulticastGroupProtocol. +// +// Precondition: igmp.ep.mu must be read locked. +func (igmp *igmpState) SendLeave(groupAddress tcpip.Address) *tcpip.Error { + // As per RFC 2236 Section 6, Page 8: "If the interface state says the + // Querier is running IGMPv1, this action SHOULD be skipped. If the flag + // saying we were the last host to report is cleared, this action MAY be + // skipped." + if igmp.v1Present() { + return nil + } + _, err := igmp.writePacket(header.IPv4AllRoutersGroup, groupAddress, header.IGMPLeaveGroup) + return err +} + +// init sets up an igmpState struct, and is required to be called before using +// a new igmpState. +// +// Must only be called once for the lifetime of igmp. +func (igmp *igmpState) init(ep *endpoint) { + igmp.ep = ep + igmp.genericMulticastProtocol.Init(&ep.mu.RWMutex, ip.GenericMulticastProtocolOptions{ + Rand: ep.protocol.stack.Rand(), + Clock: ep.protocol.stack.Clock(), + Protocol: igmp, + MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax, + AllNodesAddress: header.IPv4AllSystems, + }) + igmp.igmpV1Present = igmpV1PresentDefault + igmp.igmpV1Job = ep.protocol.stack.NewJob(&ep.mu, func() { + igmp.setV1Present(false) + }) +} + +// handleIGMP handles an IGMP packet. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) handleIGMP(pkt *stack.PacketBuffer) { + stats := igmp.ep.protocol.stack.Stats() + received := stats.IGMP.PacketsReceived + headerView, ok := pkt.Data.PullUp(header.IGMPMinimumSize) + if !ok { + received.Invalid.Increment() + return + } + h := header.IGMP(headerView) + + // Temporarily reset the checksum field to 0 in order to calculate the proper + // checksum. + wantChecksum := h.Checksum() + h.SetChecksum(0) + gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */) + h.SetChecksum(wantChecksum) + + if gotChecksum != wantChecksum { + received.ChecksumErrors.Increment() + return + } + + switch h.Type() { + case header.IGMPMembershipQuery: + received.MembershipQuery.Increment() + if len(headerView) < header.IGMPQueryMinimumSize { + received.Invalid.Increment() + return + } + igmp.handleMembershipQuery(h.GroupAddress(), h.MaxRespTime()) + case header.IGMPv1MembershipReport: + received.V1MembershipReport.Increment() + if len(headerView) < header.IGMPReportMinimumSize { + received.Invalid.Increment() + return + } + igmp.handleMembershipReport(h.GroupAddress()) + case header.IGMPv2MembershipReport: + received.V2MembershipReport.Increment() + if len(headerView) < header.IGMPReportMinimumSize { + received.Invalid.Increment() + return + } + igmp.handleMembershipReport(h.GroupAddress()) + case header.IGMPLeaveGroup: + received.LeaveGroup.Increment() + // As per RFC 2236 Section 6, Page 7: "IGMP messages other than Query or + // Report, are ignored in all states" + + default: + // As per RFC 2236 Section 2.1 Page 3: "Unrecognized message types should + // be silently ignored. New message types may be used by newer versions of + // IGMP, by multicast routing protocols, or other uses." + received.Unrecognized.Increment() + } +} + +func (igmp *igmpState) v1Present() bool { + return atomic.LoadUint32(&igmp.igmpV1Present) == 1 +} + +func (igmp *igmpState) setV1Present(v bool) { + if v { + atomic.StoreUint32(&igmp.igmpV1Present, 1) + } else { + atomic.StoreUint32(&igmp.igmpV1Present, 0) + } +} + +// handleMembershipQuery handles a membership query. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) handleMembershipQuery(groupAddress tcpip.Address, maxRespTime time.Duration) { + // As per RFC 2236 Section 6, Page 10: If the maximum response time is zero + // then change the state to note that an IGMPv1 router is present and + // schedule the query received Job. + if maxRespTime == 0 && igmp.Enabled() { + igmp.igmpV1Job.Cancel() + igmp.igmpV1Job.Schedule(v1RouterPresentTimeout) + igmp.setV1Present(true) + maxRespTime = v1MaxRespTime + } + + igmp.genericMulticastProtocol.HandleQueryLocked(groupAddress, maxRespTime) +} + +// handleMembershipReport handles a membership report. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) handleMembershipReport(groupAddress tcpip.Address) { + igmp.genericMulticastProtocol.HandleReportLocked(groupAddress) +} + +// writePacket assembles and sends an IGMP packet. +// +// Precondition: igmp.ep.mu must be read locked. +func (igmp *igmpState) writePacket(destAddress tcpip.Address, groupAddress tcpip.Address, igmpType header.IGMPType) (bool, *tcpip.Error) { + igmpData := header.IGMP(buffer.NewView(header.IGMPReportMinimumSize)) + igmpData.SetType(igmpType) + igmpData.SetGroupAddress(groupAddress) + igmpData.SetChecksum(header.IGMPCalculateChecksum(igmpData)) + + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(igmp.ep.MaxHeaderLength()), + Data: buffer.View(igmpData).ToVectorisedView(), + }) + + addressEndpoint := igmp.ep.acquireOutgoingPrimaryAddressRLocked(destAddress, false /* allowExpired */) + if addressEndpoint == nil { + return false, nil + } + localAddr := addressEndpoint.AddressWithPrefix().Address + addressEndpoint.DecRef() + addressEndpoint = nil + igmp.ep.addIPHeader(localAddr, destAddress, pkt, stack.NetworkHeaderParams{ + Protocol: header.IGMPProtocolNumber, + TTL: header.IGMPTTL, + TOS: stack.DefaultTOS, + }, header.IPv4OptionsSerializer{ + &header.IPv4SerializableRouterAlertOption{}, + }) + + sentStats := igmp.ep.protocol.stack.Stats().IGMP.PacketsSent + if err := igmp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv4Address(destAddress), nil /* gso */, ProtocolNumber, pkt); err != nil { + sentStats.Dropped.Increment() + return false, err + } + switch igmpType { + case header.IGMPv1MembershipReport: + sentStats.V1MembershipReport.Increment() + case header.IGMPv2MembershipReport: + sentStats.V2MembershipReport.Increment() + case header.IGMPLeaveGroup: + sentStats.LeaveGroup.Increment() + default: + panic(fmt.Sprintf("unrecognized igmp type = %d", igmpType)) + } + return true, nil +} + +// joinGroup handles adding a new group to the membership map, setting up the +// IGMP state for the group, and sending and scheduling the required +// messages. +// +// If the group already exists in the membership map, returns +// tcpip.ErrDuplicateAddress. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) joinGroup(groupAddress tcpip.Address) { + igmp.genericMulticastProtocol.JoinGroupLocked(groupAddress) +} + +// isInGroup returns true if the specified group has been joined locally. +// +// Precondition: igmp.ep.mu must be read locked. +func (igmp *igmpState) isInGroup(groupAddress tcpip.Address) bool { + return igmp.genericMulticastProtocol.IsLocallyJoinedRLocked(groupAddress) +} + +// leaveGroup handles removing the group from the membership map, cancels any +// delay timers associated with that group, and sends the Leave Group message +// if required. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) leaveGroup(groupAddress tcpip.Address) *tcpip.Error { + // LeaveGroup returns false only if the group was not joined. + if igmp.genericMulticastProtocol.LeaveGroupLocked(groupAddress) { + return nil + } + + return tcpip.ErrBadLocalAddress +} + +// softLeaveAll leaves all groups from the perspective of IGMP, but remains +// joined locally. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) softLeaveAll() { + igmp.genericMulticastProtocol.MakeAllNonMemberLocked() +} + +// initializeAll attemps to initialize the IGMP state for each group that has +// been joined locally. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) initializeAll() { + igmp.genericMulticastProtocol.InitializeGroupsLocked() +} + +// sendQueuedReports attempts to send any reports that are queued for sending. +// +// Precondition: igmp.ep.mu must be locked. +func (igmp *igmpState) sendQueuedReports() { + igmp.genericMulticastProtocol.SendQueuedReportsLocked() +} diff --git a/pkg/tcpip/network/ipv4/igmp_test.go b/pkg/tcpip/network/ipv4/igmp_test.go new file mode 100644 index 000000000..1ee573ac8 --- /dev/null +++ b/pkg/tcpip/network/ipv4/igmp_test.go @@ -0,0 +1,215 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipv4_test + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" + "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +const ( + linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") + addr = tcpip.Address("\x0a\x00\x00\x01") + multicastAddr = tcpip.Address("\xe0\x00\x00\x03") + nicID = 1 +) + +// validateIgmpPacket checks that a passed PacketInfo is an IPv4 IGMP packet +// sent to the provided address with the passed fields set. Raises a t.Error if +// any field does not match. +func validateIgmpPacket(t *testing.T, p channel.PacketInfo, remoteAddress tcpip.Address, igmpType header.IGMPType, maxRespTime byte, groupAddress tcpip.Address) { + t.Helper() + + payload := header.IPv4(stack.PayloadSince(p.Pkt.NetworkHeader())) + checker.IPv4(t, payload, + checker.SrcAddr(addr), + checker.DstAddr(remoteAddress), + // TTL for an IGMP message must be 1 as per RFC 2236 section 2. + checker.TTL(1), + checker.IPv4RouterAlert(), + checker.IGMP( + checker.IGMPType(igmpType), + checker.IGMPMaxRespTime(header.DecisecondToDuration(maxRespTime)), + checker.IGMPGroupAddress(groupAddress), + ), + ) +} + +func createStack(t *testing.T, igmpEnabled bool) (*channel.Endpoint, *stack.Stack, *faketime.ManualClock) { + t.Helper() + + // Create an endpoint of queue size 1, since no more than 1 packets are ever + // queued in the tests in this file. + e := channel.New(1, 1280, linkAddr) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocolWithOptions(ipv4.Options{ + IGMP: ipv4.IGMPOptions{ + Enabled: igmpEnabled, + }, + })}, + Clock: clock, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + return e, s, clock +} + +func createAndInjectIGMPPacket(e *channel.Endpoint, igmpType header.IGMPType, maxRespTime byte, groupAddress tcpip.Address) { + buf := buffer.NewView(header.IPv4MinimumSize + header.IGMPQueryMinimumSize) + + ip := header.IPv4(buf) + ip.Encode(&header.IPv4Fields{ + TotalLength: uint16(len(buf)), + TTL: 1, + Protocol: uint8(header.IGMPProtocolNumber), + SrcAddr: header.IPv4Any, + DstAddr: header.IPv4AllSystems, + }) + ip.SetChecksum(^ip.CalculateChecksum()) + + igmp := header.IGMP(buf[header.IPv4MinimumSize:]) + igmp.SetType(igmpType) + igmp.SetMaxRespTime(maxRespTime) + igmp.SetGroupAddress(groupAddress) + igmp.SetChecksum(header.IGMPCalculateChecksum(igmp)) + + e.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{ + Data: buf.ToVectorisedView(), + }) +} + +// TestIgmpV1Present tests the handling of the case where an IGMPv1 router is +// present on the network. The IGMP stack will then send IGMPv1 Membership +// reports for backwards compatibility. +func TestIgmpV1Present(t *testing.T) { + e, s, clock := createStack(t, true) + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, addr); err != nil { + t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, addr, err) + } + + if err := s.JoinGroup(ipv4.ProtocolNumber, nicID, multicastAddr); err != nil { + t.Fatalf("JoinGroup(ipv4, nic, %s) = %s", multicastAddr, err) + } + + // This NIC will send an IGMPv2 report immediately, before this test can get + // the IGMPv1 General Membership Query in. + p, ok := e.Read() + if !ok { + t.Fatal("unable to Read IGMP packet, expected V2MembershipReport") + } + if got := s.Stats().IGMP.PacketsSent.V2MembershipReport.Value(); got != 1 { + t.Fatalf("got V2MembershipReport messages sent = %d, want = 1", got) + } + validateIgmpPacket(t, p, multicastAddr, header.IGMPv2MembershipReport, 0, multicastAddr) + if t.Failed() { + t.FailNow() + } + + // Inject an IGMPv1 General Membership Query which is identical to a standard + // membership query except the Max Response Time is set to 0, which will tell + // the stack that this is a router using IGMPv1. Send it to the all systems + // group which is the only group this host belongs to. + createAndInjectIGMPPacket(e, header.IGMPMembershipQuery, 0, header.IPv4AllSystems) + if got := s.Stats().IGMP.PacketsReceived.MembershipQuery.Value(); got != 1 { + t.Fatalf("got Membership Queries received = %d, want = 1", got) + } + + // Before advancing the clock, verify that this host has not sent a + // V1MembershipReport yet. + if got := s.Stats().IGMP.PacketsSent.V1MembershipReport.Value(); got != 0 { + t.Fatalf("got V1MembershipReport messages sent = %d, want = 0", got) + } + + // Verify the solicited Membership Report is sent. Now that this NIC has seen + // an IGMPv1 query, it should send an IGMPv1 Membership Report. + p, ok = e.Read() + if ok { + t.Fatalf("sent unexpected packet, expected V1MembershipReport only after advancing the clock = %+v", p.Pkt) + } + clock.Advance(ipv4.UnsolicitedReportIntervalMax) + p, ok = e.Read() + if !ok { + t.Fatal("unable to Read IGMP packet, expected V1MembershipReport") + } + if got := s.Stats().IGMP.PacketsSent.V1MembershipReport.Value(); got != 1 { + t.Fatalf("got V1MembershipReport messages sent = %d, want = 1", got) + } + validateIgmpPacket(t, p, multicastAddr, header.IGMPv1MembershipReport, 0, multicastAddr) +} + +func TestSendQueuedIGMPReports(t *testing.T) { + e, s, clock := createStack(t, true) + + // Joining a group without an assigned address should queue IGMP packets; none + // should be sent without an assigned address. + if err := s.JoinGroup(ipv4.ProtocolNumber, nicID, multicastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", ipv4.ProtocolNumber, nicID, multicastAddr, err) + } + reportStat := s.Stats().IGMP.PacketsSent.V2MembershipReport + if got := reportStat.Value(); got != 0 { + t.Errorf("got reportStat.Value() = %d, want = 0", got) + } + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("got unexpected packet = %#v", p) + } + + // The initial set of IGMP reports that were queued should be sent once an + // address is assigned. + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, addr); err != nil { + t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, addr, err) + } + if got := reportStat.Value(); got != 1 { + t.Errorf("got reportStat.Value() = %d, want = 1", got) + } + if p, ok := e.Read(); !ok { + t.Error("expected to send an IGMP membership report") + } else { + validateIgmpPacket(t, p, multicastAddr, header.IGMPv2MembershipReport, 0, multicastAddr) + } + if t.Failed() { + t.FailNow() + } + clock.Advance(ipv4.UnsolicitedReportIntervalMax) + if got := reportStat.Value(); got != 2 { + t.Errorf("got reportStat.Value() = %d, want = 2", got) + } + if p, ok := e.Read(); !ok { + t.Error("expected to send an IGMP membership report") + } else { + validateIgmpPacket(t, p, multicastAddr, header.IGMPv2MembershipReport, 0, multicastAddr) + } + if t.Failed() { + t.FailNow() + } + + // Should have no more packets to send after the initial set of unsolicited + // reports. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("got unexpected packet = %#v", p) + } +} diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 1efe6297a..e9ff70d04 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -83,6 +83,7 @@ type endpoint struct { sync.RWMutex addressableEndpointState stack.AddressableEndpointState + igmp igmpState } } @@ -93,7 +94,10 @@ func (p *protocol) NewEndpoint(nic stack.NetworkInterface, _ stack.LinkAddressCa dispatcher: dispatcher, protocol: p, } + e.mu.Lock() e.mu.addressableEndpointState.Init(e) + e.mu.igmp.init(e) + e.mu.Unlock() return e } @@ -121,11 +125,22 @@ func (e *endpoint) Enable() *tcpip.Error { // We have no need for the address endpoint. ep.DecRef() + // Groups may have been joined while the endpoint was disabled, or the + // endpoint may have left groups from the perspective of IGMP when the + // endpoint was disabled. Either way, we need to let routers know to + // send us multicast traffic. + e.mu.igmp.initializeAll() + // As per RFC 1122 section 3.3.7, all hosts should join the all-hosts // multicast group. Note, the IANA calls the all-hosts multicast group the // all-systems multicast group. - _, err = e.mu.addressableEndpointState.JoinGroup(header.IPv4AllSystems) - return err + if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil { + // joinGroupLocked only returns an error if the group address is not a valid + // IPv4 multicast address. + panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err)) + } + + return nil } // Enabled implements stack.NetworkEndpoint. @@ -157,19 +172,27 @@ func (e *endpoint) Disable() { } func (e *endpoint) disableLocked() { - if !e.setEnabled(false) { + if !e.isEnabled() { return } // The endpoint may have already left the multicast group. - if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv4AllSystems); err != nil && err != tcpip.ErrBadLocalAddress { + if err := e.leaveGroupLocked(header.IPv4AllSystems); err != nil && err != tcpip.ErrBadLocalAddress { panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err)) } + // Leave groups from the perspective of IGMP so that routers know that + // we are no longer interested in the group. + e.mu.igmp.softLeaveAll() + // The address may have already been removed. if err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err != nil && err != tcpip.ErrBadLocalAddress { panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err)) } + + if !e.setEnabled(false) { + panic("should have only done work to disable the endpoint if it was enabled") + } } // DefaultTTL is the default time-to-live value for this endpoint. @@ -198,37 +221,34 @@ func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return e.protocol.Number() } -func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) { +func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) { hdrLen := header.IPv4MinimumSize - var opts header.IPv4Options - if params.Options != nil { - var ok bool - if opts, ok = params.Options.(header.IPv4Options); !ok { - panic(fmt.Sprintf("want IPv4Options, got %T", params.Options)) - } - hdrLen += opts.SizeWithPadding() - if hdrLen > header.IPv4MaximumHeaderSize { - // Since we have no way to report an error we must either panic or create - // a packet which is different to what was requested. Choose panic as this - // would be a programming error that should be caught in testing. - panic(fmt.Sprintf("IPv4 Options %d bytes, Max %d", params.Options.SizeWithPadding(), header.IPv4MaximumOptionsSize)) - } + var optLen int + if options != nil { + optLen = int(options.Length()) + } + hdrLen += optLen + if hdrLen > header.IPv4MaximumHeaderSize { + // Since we have no way to report an error we must either panic or create + // a packet which is different to what was requested. Choose panic as this + // would be a programming error that should be caught in testing. + panic(fmt.Sprintf("IPv4 Options %d bytes, Max %d", optLen, header.IPv4MaximumOptionsSize)) } ip := header.IPv4(pkt.NetworkHeader().Push(hdrLen)) length := uint16(pkt.Size()) // RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic // datagrams. Since the DF bit is never being set here, all datagrams // are non-atomic and need an ID. - id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, params.Protocol, e.protocol.hashIV)%buckets], 1) + id := atomic.AddUint32(&e.protocol.ids[hashRoute(srcAddr, dstAddr, params.Protocol, e.protocol.hashIV)%buckets], 1) ip.Encode(&header.IPv4Fields{ TotalLength: length, ID: uint16(id), TTL: params.TTL, TOS: params.TOS, Protocol: uint8(params.Protocol), - SrcAddr: r.LocalAddress, - DstAddr: r.RemoteAddress, - Options: opts, + SrcAddr: srcAddr, + DstAddr: dstAddr, + Options: options, }) ip.SetChecksum(^ip.CalculateChecksum()) pkt.NetworkProtocolNumber = ProtocolNumber @@ -259,7 +279,7 @@ func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU ui // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { - e.addIPHeader(r, pkt, params) + e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */) // iptables filtering. All packets that reach here are locally // generated. @@ -347,7 +367,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe } for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.addIPHeader(r, pkt, params) + e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */) networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size())) if err != nil { r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len())) @@ -461,7 +481,7 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu // non-atomic datagrams, so assign an ID to all such datagrams // according to the definition given in RFC 6864 section 4. if ip.Flags()&header.IPv4FlagDontFragment == 0 || ip.Flags()&header.IPv4FlagMoreFragments != 0 || ip.FragmentOffset() > 0 { - ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1))) + ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r.LocalAddress, r.RemoteAddress, 0 /* protocol */, e.protocol.hashIV)%buckets], 1))) } } @@ -566,21 +586,6 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { stats.IP.MalformedPacketsReceived.Increment() return } - srcAddr := h.SourceAddress() - dstAddr := h.DestinationAddress() - - addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint) - if addressEndpoint == nil { - if !e.protocol.Forwarding() { - stats.IP.InvalidDestinationAddressesReceived.Increment() - return - } - - _ = e.forwardPacket(pkt) - return - } - subnet := addressEndpoint.AddressWithPrefix().Subnet() - addressEndpoint.DecRef() // There has been some confusion regarding verifying checksums. We need // just look for negative 0 (0xffff) as the checksum, as it's not possible to @@ -608,16 +613,42 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { return } + srcAddr := h.SourceAddress() + dstAddr := h.DestinationAddress() + // As per RFC 1122 section 3.2.1.3: // When a host sends any datagram, the IP source address MUST // be one of its own IP addresses (but not a broadcast or // multicast address). - if directedBroadcast := subnet.IsBroadcast(srcAddr); directedBroadcast || srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) { + if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) { stats.IP.InvalidSourceAddressesReceived.Increment() return } + // Make sure the source address is not a subnet-local broadcast address. + if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil { + subnet := addressEndpoint.Subnet() + addressEndpoint.DecRef() + if subnet.IsBroadcast(srcAddr) { + stats.IP.InvalidSourceAddressesReceived.Increment() + return + } + } + + // The destination address should be an address we own or a group we joined + // for us to receive the packet. Otherwise, attempt to forward the packet. + if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil { + subnet := addressEndpoint.AddressWithPrefix().Subnet() + addressEndpoint.DecRef() + pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast + } else if !e.IsInGroup(dstAddr) { + if !e.protocol.Forwarding() { + stats.IP.InvalidDestinationAddressesReceived.Increment() + return + } - pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast + _ = e.forwardPacket(pkt) + return + } // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. @@ -692,6 +723,12 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { e.handleICMP(pkt) return } + if p == header.IGMPProtocolNumber { + e.mu.Lock() + e.mu.igmp.handleIGMP(pkt) + e.mu.Unlock() + return + } if opts := h.Options(); len(opts) != 0 { // TODO(gvisor.dev/issue/4586): // When we add forwarding support we should use the verified options @@ -747,7 +784,12 @@ func (e *endpoint) Close() { func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, *tcpip.Error) { e.mu.Lock() defer e.mu.Unlock() - return e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated) + + ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated) + if err == nil { + e.mu.igmp.sendQueuedReports() + } + return ep, err } // RemovePermanentAddress implements stack.AddressableEndpoint. @@ -770,34 +812,26 @@ func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp boo defer e.mu.Unlock() loopback := e.nic.IsLoopback() - addressEndpoint := e.mu.addressableEndpointState.ReadOnly().AddrOrMatching(localAddr, allowTemp, func(addressEndpoint stack.AddressEndpoint) bool { + return e.mu.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool { subnet := addressEndpoint.Subnet() // IPv4 has a notion of a subnet broadcast address and considers the // loopback interface bound to an address's whole subnet (on linux). return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr)) - }) - if addressEndpoint != nil { - return addressEndpoint - } - - if !allowTemp { - return nil - } - - addr := localAddr.WithPrefix() - addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(addr, tempPEB) - if err != nil { - // AddAddress only returns an error if the address is already assigned, - // but we just checked above if the address exists so we expect no error. - panic(fmt.Sprintf("e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(%s, %d): %s", addr, tempPEB, err)) - } - return addressEndpoint + }, allowTemp, tempPEB) } // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() + return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired) +} + +// acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress +// but with locking requirements +// +// Precondition: igmp.ep.mu must be read locked. +func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired) } @@ -816,28 +850,43 @@ func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { } // JoinGroup implements stack.GroupAddressableEndpoint. -func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) { +func (e *endpoint) JoinGroup(addr tcpip.Address) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + return e.joinGroupLocked(addr) +} + +// joinGroupLocked is like JoinGroup but with locking requirements. +// +// Precondition: e.mu must be locked. +func (e *endpoint) joinGroupLocked(addr tcpip.Address) *tcpip.Error { if !header.IsV4MulticastAddress(addr) { - return false, tcpip.ErrBadAddress + return tcpip.ErrBadAddress } - e.mu.Lock() - defer e.mu.Unlock() - return e.mu.addressableEndpointState.JoinGroup(addr) + e.mu.igmp.joinGroup(addr) + return nil } // LeaveGroup implements stack.GroupAddressableEndpoint. -func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) { +func (e *endpoint) LeaveGroup(addr tcpip.Address) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - return e.mu.addressableEndpointState.LeaveGroup(addr) + return e.leaveGroupLocked(addr) +} + +// leaveGroupLocked is like LeaveGroup but with locking requirements. +// +// Precondition: e.mu must be locked. +func (e *endpoint) leaveGroupLocked(addr tcpip.Address) *tcpip.Error { + return e.mu.igmp.leaveGroup(addr) } // IsInGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) IsInGroup(addr tcpip.Address) bool { e.mu.RLock() defer e.mu.RUnlock() - return e.mu.addressableEndpointState.IsInGroup(addr) + return e.mu.igmp.isInGroup(addr) } var _ stack.ForwardingNetworkProtocol = (*protocol)(nil) @@ -863,6 +912,8 @@ type protocol struct { hashIV uint32 fragmentation *fragmentation.Fragmentation + + options Options } // Number returns the ipv4 protocol number. @@ -987,17 +1038,23 @@ func addressToUint32(addr tcpip.Address) uint32 { return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24 } -// hashRoute calculates a hash value for the given route. It uses the source & -// destination address, the transport protocol number and a 32-bit number to -// generate the hash. -func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 { - a := addressToUint32(r.LocalAddress) - b := addressToUint32(r.RemoteAddress) +// hashRoute calculates a hash value for the given source/destination pair using +// the addresses, transport protocol number and a 32-bit number to generate the +// hash. +func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 { + a := addressToUint32(srcAddr) + b := addressToUint32(dstAddr) return hash.Hash3Words(a, b, uint32(protocol), hashIV) } -// NewProtocol returns an IPv4 network protocol. -func NewProtocol(s *stack.Stack) stack.NetworkProtocol { +// Options holds options to configure a new protocol. +type Options struct { + // IGMP holds options for IGMP. + IGMP IGMPOptions +} + +// NewProtocolWithOptions returns an IPv4 network protocol. +func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { ids := make([]uint32, buckets) // Randomly initialize hashIV and the ids. @@ -1007,14 +1064,22 @@ func NewProtocol(s *stack.Stack) stack.NetworkProtocol { } hashIV := r[buckets] - p := &protocol{ - stack: s, - ids: ids, - hashIV: hashIV, - defaultTTL: DefaultTTL, + return func(s *stack.Stack) stack.NetworkProtocol { + p := &protocol{ + stack: s, + ids: ids, + hashIV: hashIV, + defaultTTL: DefaultTTL, + options: opts, + } + p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) + return p } - p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) - return p +} + +// NewProtocol is equivalent to NewProtocolWithOptions with an empty Options. +func NewProtocol(s *stack.Stack) stack.NetworkProtocol { + return NewProtocolWithOptions(Options{})(s) } func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) { @@ -1129,6 +1194,12 @@ func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Addres } pointer := tsOpt.Pointer() + // RFC 791 page 22 states: "The smallest legal value is 5." + // Since the pointer is 1 based, and the header is 4 bytes long the + // pointer must point beyond the header therefore 4 or less is bad. + if pointer <= header.IPv4OptionTimestampHdrLength { + return header.IPv4OptTSPointerOffset, errIPv4TimestampOptInvalidPointer + } // To simplify processing below, base further work on the array of timestamps // beyond the header, rather than on the whole option. Also to aid // calculations set 'nextSlot' to be 0 based as in the packet it is 1 based. @@ -1215,7 +1286,15 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad return header.IPv4OptionLengthOffset, errIPv4RecordRouteOptInvalidLength } - nextSlot := rrOpt.Pointer() - 1 // Pointer is 1 based. + pointer := rrOpt.Pointer() + // RFC 791 page 20 states: + // The pointer is relative to this option, and the + // smallest legal value for the pointer is 4. + // Since the pointer is 1 based, and the header is 3 bytes long the + // pointer must point beyond the header therefore 3 or less is bad. + if pointer <= header.IPv4OptionRecordRouteHdrLength { + return header.IPv4OptRRPointerOffset, errIPv4RecordRouteOptInvalidPointer + } // RFC 791 page 21 says // If the route data area is already full (the pointer exceeds the @@ -1230,14 +1309,14 @@ func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Ad // do this (as do most implementations). It is probable that the inclusion // of these words is a copy/paste error from the timestamp option where // there are two failure reasons given. - if nextSlot >= optlen { + if pointer > optlen { return 0, nil } // The data area isn't full but there isn't room for a new entry. // Either Length or Pointer could be bad. We must select Pointer for Linux - // compatibility, even if only the length is bad. - if nextSlot+header.IPv4AddressSize > optlen { + // compatibility, even if only the length is bad. NB. pointer is 1 based. + if pointer+header.IPv4AddressSize > optlen+1 { if false { // This is what we would do if we were not being Linux compatible. // Check for bad pointer or length value. Must be a multiple of 4 after diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 4e4e1f3b4..9e2d2cfd6 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -103,105 +103,6 @@ func TestExcludeBroadcast(t *testing.T) { }) } -// TestIPv4Encode checks that ipv4.Encode correctly fills out the requested -// fields when options are supplied. -func TestIPv4EncodeOptions(t *testing.T) { - tests := []struct { - name string - options header.IPv4Options - encodedOptions header.IPv4Options // reply should look like this - wantIHL int - }{ - { - name: "valid no options", - wantIHL: header.IPv4MinimumSize, - }, - { - name: "one byte options", - options: header.IPv4Options{1}, - encodedOptions: header.IPv4Options{1, 0, 0, 0}, - wantIHL: header.IPv4MinimumSize + 4, - }, - { - name: "two byte options", - options: header.IPv4Options{1, 1}, - encodedOptions: header.IPv4Options{1, 1, 0, 0}, - wantIHL: header.IPv4MinimumSize + 4, - }, - { - name: "three byte options", - options: header.IPv4Options{1, 1, 1}, - encodedOptions: header.IPv4Options{1, 1, 1, 0}, - wantIHL: header.IPv4MinimumSize + 4, - }, - { - name: "four byte options", - options: header.IPv4Options{1, 1, 1, 1}, - encodedOptions: header.IPv4Options{1, 1, 1, 1}, - wantIHL: header.IPv4MinimumSize + 4, - }, - { - name: "five byte options", - options: header.IPv4Options{1, 1, 1, 1, 1}, - encodedOptions: header.IPv4Options{1, 1, 1, 1, 1, 0, 0, 0}, - wantIHL: header.IPv4MinimumSize + 8, - }, - { - name: "thirty nine byte options", - options: header.IPv4Options{ - 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, - }, - encodedOptions: header.IPv4Options{ - 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 0, - }, - wantIHL: header.IPv4MinimumSize + 40, - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - paddedOptionLength := test.options.SizeWithPadding() - ipHeaderLength := header.IPv4MinimumSize + paddedOptionLength - if ipHeaderLength > header.IPv4MaximumHeaderSize { - t.Fatalf("IP header length too large: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize) - } - totalLen := uint16(ipHeaderLength) - hdr := buffer.NewPrependable(int(totalLen)) - ip := header.IPv4(hdr.Prepend(ipHeaderLength)) - // To check the padding works, poison the last byte of the options space. - if paddedOptionLength != len(test.options) { - ip.SetHeaderLength(uint8(ipHeaderLength)) - ip.Options()[paddedOptionLength-1] = 0xff - ip.SetHeaderLength(0) - } - ip.Encode(&header.IPv4Fields{ - Options: test.options, - }) - options := ip.Options() - wantOptions := test.encodedOptions - if got, want := int(ip.HeaderLength()), test.wantIHL; got != want { - t.Errorf("got IHL of %d, want %d", got, want) - } - - // cmp.Diff does not consider nil slices equal to empty slices, but we do. - if len(wantOptions) == 0 && len(options) == 0 { - return - } - - if diff := cmp.Diff(wantOptions, options); diff != "" { - t.Errorf("options mismatch (-want +got):\n%s", diff) - } - }) - } -} - func TestForwarding(t *testing.T) { const ( nicID1 = 1 @@ -453,14 +354,6 @@ func TestIPv4Sanity(t *testing.T) { replyOptions: header.IPv4Options{1, 1, 0, 0}, }, { - name: "Check option padding", - maxTotalLength: ipv4.MaxTotalSize, - transportProtocol: uint8(header.ICMPv4ProtocolNumber), - TTL: ttl, - options: header.IPv4Options{1, 1, 1}, - replyOptions: header.IPv4Options{1, 1, 1, 0}, - }, - { name: "bad header length", headerLength: header.IPv4MinimumSize - 1, maxTotalLength: ipv4.MaxTotalSize, @@ -583,7 +476,7 @@ func TestIPv4Sanity(t *testing.T) { 68, 7, 5, 0, // ^ ^ Linux points here which is wrong. // | Not a multiple of 4 - 1, 2, 3, + 1, 2, 3, 0, }, shouldFail: true, expectErrorICMP: true, @@ -662,6 +555,56 @@ func TestIPv4Sanity(t *testing.T) { }, }, { + // Timestamp pointer uses one based counting so 0 is invalid. + name: "timestamp pointer invalid", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 68, 8, 0, 0x00, + // ^ 0 instead of 5 or more. + 0, 0, 0, 0, + }, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + 2, + }, + { + // Timestamp pointer cannot be less than 5. It must point past the header + // which is 4 bytes. (1 based counting) + name: "timestamp pointer too small by 1", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 68, 8, header.IPv4OptionTimestampHdrLength, 0x00, + // ^ header is 4 bytes, so 4 should fail. + 0, 0, 0, 0, + }, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + 2, + }, + { + name: "valid timestamp pointer", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 68, 8, header.IPv4OptionTimestampHdrLength + 1, 0x00, + // ^ header is 4 bytes, so 5 should succeed. + 0, 0, 0, 0, + }, + replyOptions: header.IPv4Options{ + 68, 8, 9, 0x00, + 0x00, 0xad, 0x1c, 0x40, // time we expect from fakeclock + }, + }, + { // Needs 8 bytes for a type 1 timestamp but there are only 4 free. name: "bad timer element alignment", maxTotalLength: ipv4.MaxTotalSize, @@ -792,7 +735,61 @@ func TestIPv4Sanity(t *testing.T) { }, }, { - // Confirm linux bug for bug compatibility. + // Pointer uses one based counting so 0 is invalid. + name: "record route pointer zero", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 7, 8, 0, // 3 byte header + 0, 0, 0, 0, + 0, + }, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + 2, + }, + { + // Pointer must be 4 or more as it must point past the 3 byte header + // using 1 based counting. 3 should fail. + name: "record route pointer too small by 1", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 7, 8, header.IPv4OptionRecordRouteHdrLength, // 3 byte header + 0, 0, 0, 0, + 0, + }, + shouldFail: true, + expectErrorICMP: true, + ICMPType: header.ICMPv4ParamProblem, + ICMPCode: header.ICMPv4UnusedCode, + paramProblemPointer: header.IPv4MinimumSize + 2, + }, + { + // Pointer must be 4 or more as it must point past the 3 byte header + // using 1 based counting. Check 4 passes. (Duplicates "single + // record route with room") + name: "valid record route pointer", + maxTotalLength: ipv4.MaxTotalSize, + transportProtocol: uint8(header.ICMPv4ProtocolNumber), + TTL: ttl, + options: header.IPv4Options{ + 7, 7, header.IPv4OptionRecordRouteHdrLength + 1, // 3 byte header + 0, 0, 0, 0, + 0, + }, + replyOptions: header.IPv4Options{ + 7, 7, 8, // 3 byte header + 192, 168, 1, 58, // New IP Address. + 0, // padding to multiple of 4 bytes. + }, + }, + { + // Confirm Linux bug for bug compatibility. // Linux returns slot 22 but the error is in slot 21. name: "multiple record route with not enough room", maxTotalLength: ipv4.MaxTotalSize, @@ -863,8 +860,10 @@ func TestIPv4Sanity(t *testing.T) { }, }) - paddedOptionLength := test.options.SizeWithPadding() - ipHeaderLength := header.IPv4MinimumSize + paddedOptionLength + if len(test.options)%4 != 0 { + t.Fatalf("options must be aligned to 32 bits, invalid test options: %x (len=%d)", test.options, len(test.options)) + } + ipHeaderLength := header.IPv4MinimumSize + len(test.options) if ipHeaderLength > header.IPv4MaximumHeaderSize { t.Fatalf("IP header length too large: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize) } @@ -883,11 +882,6 @@ func TestIPv4Sanity(t *testing.T) { if test.maxTotalLength < totalLen { totalLen = test.maxTotalLength } - // To check the padding works, poison the options space. - if paddedOptionLength != len(test.options) { - ip.SetHeaderLength(uint8(ipHeaderLength)) - ip.Options()[paddedOptionLength-1] = 0x01 - } ip.Encode(&header.IPv4Fields{ TotalLength: totalLen, @@ -895,10 +889,19 @@ func TestIPv4Sanity(t *testing.T) { TTL: test.TTL, SrcAddr: remoteIPv4Addr, DstAddr: ipv4Addr.Address, - Options: test.options, }) if test.headerLength != 0 { ip.SetHeaderLength(test.headerLength) + } else { + // Set the calculated header length, since we may manually add options. + ip.SetHeaderLength(uint8(ipHeaderLength)) + } + if len(test.options) != 0 { + // Copy options manually. We do not use Encode for options so we can + // verify malformed options with handcrafted payloads. + if want, got := copy(ip.Options(), test.options), len(test.options); want != got { + t.Fatalf("got copy(ip.Options(), test.options) = %d, want = %d", got, want) + } } ip.SetChecksum(0) ipHeaderChecksum := ip.CalculateChecksum() @@ -1003,7 +1006,7 @@ func TestIPv4Sanity(t *testing.T) { } // If the IP options change size then the packet will change size, so // some IP header fields will need to be adjusted for the checks. - sizeChange := len(test.replyOptions) - paddedOptionLength + sizeChange := len(test.replyOptions) - len(test.options) checker.IPv4(t, replyIPHeader, checker.IPv4HeaderLength(ipHeaderLength+sizeChange), @@ -2320,6 +2323,28 @@ func TestReceiveFragments(t *testing.T) { }, expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2}, }, + { + name: "Two fragments with MF flag reassembled into a maximum UDP packet", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload4Addr1ToAddr2[:65512], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 65512, + payload: ipv4Payload4Addr1ToAddr2[65512:], + }, + }, + expectedPayloads: nil, + }, } for _, test := range tests { @@ -2513,7 +2538,7 @@ func TestWriteStats(t *testing.T) { test.setup(t, rt.Stack()) - nWritten, _ := writer.writePackets(&rt, pkts) + nWritten, _ := writer.writePackets(rt, pkts) if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent { t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) @@ -2530,7 +2555,7 @@ func TestWriteStats(t *testing.T) { } } -func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route { +func buildRoute(t *testing.T, ep stack.LinkEndpoint) *stack.Route { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, }) @@ -2644,8 +2669,8 @@ func TestPacketQueing(t *testing.T) { if p.Proto != header.IPv4ProtocolNumber { t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber) } - if p.Route.RemoteLinkAddress != host2NICLinkAddr { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr) } checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address), @@ -2687,8 +2712,8 @@ func TestPacketQueing(t *testing.T) { if p.Proto != header.IPv4ProtocolNumber { t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber) } - if p.Route.RemoteLinkAddress != host2NICLinkAddr { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr) } checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address), @@ -2736,8 +2761,8 @@ func TestPacketQueing(t *testing.T) { if p.Proto != arp.ProtocolNumber { t.Errorf("got p.Proto = %d, want = %d", p.Proto, arp.ProtocolNumber) } - if p.Route.RemoteLinkAddress != header.EthernetBroadcastAddress { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, header.EthernetBroadcastAddress) + if got := p.Route.RemoteLinkAddress(); got != header.EthernetBroadcastAddress { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, header.EthernetBroadcastAddress) } rep := header.ARP(p.Pkt.NetworkHeader().View()) if got := rep.Op(); got != header.ARPRequest { diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index 0ac24a6fb..afa45aefe 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -8,6 +8,7 @@ go_library( "dhcpv6configurationfromndpra_string.go", "icmp.go", "ipv6.go", + "mld.go", "ndp.go", ], visibility = ["//visibility:public"], @@ -19,6 +20,7 @@ go_library( "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/network/hash", + "//pkg/tcpip/network/ip", "//pkg/tcpip/stack", ], ) @@ -49,3 +51,19 @@ go_test( "@com_github_google_go_cmp//cmp:go_default_library", ], ) + +go_test( + name = "ipv6_x_test", + size = "small", + srcs = ["mld_test.go"], + deps = [ + ":ipv6", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/checker", + "//pkg/tcpip/faketime", + "//pkg/tcpip/header", + "//pkg/tcpip/link/channel", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index beb8f562e..6ee162713 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -126,8 +126,8 @@ func getTargetLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) { func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) { stats := e.protocol.stack.Stats().ICMP - sent := stats.V6PacketsSent - received := stats.V6PacketsReceived + sent := stats.V6.PacketsSent + received := stats.V6.PacketsReceived // TODO(gvisor.dev/issue/170): ICMP packets don't have their // TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a // full explanation. @@ -163,7 +163,7 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) { } // TODO(b/112892170): Meaningfully handle all ICMP types. - switch h.Type() { + switch icmpType := h.Type(); icmpType { case header.ICMPv6PacketTooBig: received.PacketTooBig.Increment() hdr, ok := pkt.Data.PullUp(header.ICMPv6PacketTooBigMinimumSize) @@ -358,7 +358,7 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) { pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize)) packet.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(packet.NDPPayload()) + na := header.NDPNeighborAdvert(packet.MessageBody()) // As per RFC 4861 section 7.2.4: // @@ -644,8 +644,39 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool) { return } + case header.ICMPv6MulticastListenerQuery, header.ICMPv6MulticastListenerReport, header.ICMPv6MulticastListenerDone: + switch icmpType { + case header.ICMPv6MulticastListenerQuery: + received.MulticastListenerQuery.Increment() + case header.ICMPv6MulticastListenerReport: + received.MulticastListenerReport.Increment() + case header.ICMPv6MulticastListenerDone: + received.MulticastListenerDone.Increment() + default: + panic(fmt.Sprintf("unrecognized MLD message = %d", icmpType)) + } + + if pkt.Data.Size()-header.ICMPv6HeaderSize < header.MLDMinimumSize { + received.Invalid.Increment() + return + } + + switch icmpType { + case header.ICMPv6MulticastListenerQuery: + e.mu.Lock() + e.mu.mld.handleMulticastListenerQuery(header.MLD(payload.ToView())) + e.mu.Unlock() + case header.ICMPv6MulticastListenerReport: + e.mu.Lock() + e.mu.mld.handleMulticastListenerReport(header.MLD(payload.ToView())) + e.mu.Unlock() + case header.ICMPv6MulticastListenerDone: + default: + panic(fmt.Sprintf("unrecognized MLD message = %d", icmpType)) + } + default: - received.Invalid.Increment() + received.Unrecognized.Increment() } } @@ -681,12 +712,12 @@ func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remot pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber packet := header.ICMPv6(pkt.TransportHeader().Push(neighborSolicitSize)) packet.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(packet.NDPPayload()) + ns := header.NDPNeighborSolicit(packet.MessageBody()) ns.SetTargetAddress(targetAddr) ns.Options().Serialize(optsSerializer) packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) - stat := p.stack.Stats().ICMP.V6PacketsSent + stat := p.stack.Stats().ICMP.V6.PacketsSent if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, @@ -796,7 +827,8 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi allowResponseToMulticast = reason.respondToMulticast } - if (!allowResponseToMulticast && header.IsV6MulticastAddress(origIPHdrDst)) || origIPHdrSrc == header.IPv6Any { + isOrigDstMulticast := header.IsV6MulticastAddress(origIPHdrDst) + if (!allowResponseToMulticast && isOrigDstMulticast) || origIPHdrSrc == header.IPv6Any { return nil } @@ -812,8 +844,13 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi // If we are operating as a router, do not use the packet's destination // address as the response's source address as we should not own the // destination address of a packet we are forwarding. + // + // If the packet was originally destined to a multicast address, then do not + // use the packet's destination address as the source for the response ICMP + // packet as "multicast addresses must not be used as source addresses in IPv6 + // packets", as per RFC 4291 section 2.7. localAddr := origIPHdrDst - if _, ok := reason.(*icmpReasonHopLimitExceeded); ok { + if _, ok := reason.(*icmpReasonHopLimitExceeded); ok || isOrigDstMulticast { localAddr = "" } // Even if we were able to receive a packet from some remote, we may not have @@ -827,7 +864,7 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) *tcpi defer route.Release() stats := p.stack.Stats().ICMP - sent := stats.V6PacketsSent + sent := stats.V6.PacketsSent if !p.stack.AllowICMPMessage() { sent.RateLimited.Increment() return nil diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 9bc02d851..02b18e9a5 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -150,9 +150,9 @@ func (*testInterface) Promiscuous() bool { func (t *testInterface) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { r := stack.Route{ - NetProto: protocol, - RemoteLinkAddress: remoteLinkAddr, + NetProto: protocol, } + r.ResolveWith(remoteLinkAddr) return t.LinkEndpoint.WritePacket(&r, gso, protocol, pkt) } @@ -271,6 +271,22 @@ func TestICMPCounts(t *testing.T) { typ: header.ICMPv6RedirectMsg, size: header.ICMPv6MinimumSize, }, + { + typ: header.ICMPv6MulticastListenerQuery, + size: header.MLDMinimumSize + header.ICMPv6HeaderSize, + }, + { + typ: header.ICMPv6MulticastListenerReport, + size: header.MLDMinimumSize + header.ICMPv6HeaderSize, + }, + { + typ: header.ICMPv6MulticastListenerDone, + size: header.MLDMinimumSize + header.ICMPv6HeaderSize, + }, + { + typ: 255, /* Unrecognized */ + size: 50, + }, } handleIPv6Payload := func(icmp header.ICMPv6) { @@ -280,11 +296,11 @@ func TestICMPCounts(t *testing.T) { }) ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(len(icmp)), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: header.NDPHopLimit, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(len(icmp)), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: header.NDPHopLimit, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) ep.HandlePacket(pkt) } @@ -301,7 +317,7 @@ func TestICMPCounts(t *testing.T) { // Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented. handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize))) - icmpv6Stats := s.Stats().ICMP.V6PacketsReceived + icmpv6Stats := s.Stats().ICMP.V6.PacketsReceived visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) { if got, want := s.Value(), uint64(1); got != want { t.Errorf("got %s = %d, want = %d", name, got, want) @@ -413,6 +429,22 @@ func TestICMPCountsWithNeighborCache(t *testing.T) { typ: header.ICMPv6RedirectMsg, size: header.ICMPv6MinimumSize, }, + { + typ: header.ICMPv6MulticastListenerQuery, + size: header.MLDMinimumSize + header.ICMPv6HeaderSize, + }, + { + typ: header.ICMPv6MulticastListenerReport, + size: header.MLDMinimumSize + header.ICMPv6HeaderSize, + }, + { + typ: header.ICMPv6MulticastListenerDone, + size: header.MLDMinimumSize + header.ICMPv6HeaderSize, + }, + { + typ: 255, /* Unrecognized */ + size: 50, + }, } handleIPv6Payload := func(icmp header.ICMPv6) { @@ -422,11 +454,11 @@ func TestICMPCountsWithNeighborCache(t *testing.T) { }) ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(len(icmp)), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: header.NDPHopLimit, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(len(icmp)), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: header.NDPHopLimit, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) ep.HandlePacket(pkt) } @@ -443,7 +475,7 @@ func TestICMPCountsWithNeighborCache(t *testing.T) { // Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented. handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize))) - icmpv6Stats := s.Stats().ICMP.V6PacketsReceived + icmpv6Stats := s.Stats().ICMP.V6.PacketsReceived visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) { if got, want := s.Value(), uint64(1); got != want { t.Errorf("got %s = %d, want = %d", name, got, want) @@ -568,8 +600,8 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header. return } - if len(args.remoteLinkAddr) != 0 && args.remoteLinkAddr != pi.Route.RemoteLinkAddress { - t.Errorf("got remote link address = %s, want = %s", pi.Route.RemoteLinkAddress, args.remoteLinkAddr) + if got := pi.Route.RemoteLinkAddress(); len(args.remoteLinkAddr) != 0 && got != args.remoteLinkAddr { + t.Errorf("got remote link address = %s, want = %s", got, args.remoteLinkAddr) } // Pull the full payload since network header. Needed for header.IPv6 to @@ -821,11 +853,11 @@ func TestICMPChecksumValidationSimple(t *testing.T) { } ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(len(icmp)), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: header.NDPHopLimit, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(len(icmp)), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: header.NDPHopLimit, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}), @@ -833,7 +865,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) { e.InjectInbound(ProtocolNumber, pkt) } - stats := s.Stats().ICMP.V6PacketsReceived + stats := s.Stats().ICMP.V6.PacketsReceived invalid := stats.Invalid routerOnly := stats.RouterOnlyPacketsDroppedByHost typStat := typ.statCounter(stats) @@ -898,11 +930,11 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) { errorICMPBody := func(view buffer.View) { ip := header.IPv6(view) ip.Encode(&header.IPv6Fields{ - PayloadLength: simpleBodySize, - NextHeader: 10, - HopLimit: 20, - SrcAddr: lladdr0, - DstAddr: lladdr1, + PayloadLength: simpleBodySize, + TransportProtocol: 10, + HopLimit: 20, + SrcAddr: lladdr0, + DstAddr: lladdr1, }) simpleBody(view[header.IPv6MinimumSize:]) } @@ -1016,11 +1048,11 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) { ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(icmpSize), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: header.NDPHopLimit, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(icmpSize), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: header.NDPHopLimit, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -1028,7 +1060,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) { e.InjectInbound(ProtocolNumber, pkt) } - stats := s.Stats().ICMP.V6PacketsReceived + stats := s.Stats().ICMP.V6.PacketsReceived invalid := stats.Invalid typStat := typ.statCounter(stats) @@ -1076,11 +1108,11 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { errorICMPBody := func(view buffer.View) { ip := header.IPv6(view) ip.Encode(&header.IPv6Fields{ - PayloadLength: simpleBodySize, - NextHeader: 10, - HopLimit: 20, - SrcAddr: lladdr0, - DstAddr: lladdr1, + PayloadLength: simpleBodySize, + TransportProtocol: 10, + HopLimit: 20, + SrcAddr: lladdr0, + DstAddr: lladdr1, }) simpleBody(view[header.IPv6MinimumSize:]) } @@ -1195,11 +1227,11 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(size + payloadSize), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: header.NDPHopLimit, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(size + payloadSize), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: header.NDPHopLimit, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}), @@ -1207,7 +1239,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { e.InjectInbound(ProtocolNumber, pkt) } - stats := s.Stats().ICMP.V6PacketsReceived + stats := s.Stats().ICMP.V6.PacketsReceived invalid := stats.Invalid typStat := typ.statCounter(stats) @@ -1349,8 +1381,8 @@ func TestLinkAddressRequest(t *testing.T) { if !ok { t.Fatal("expected to send a link address request") } - if pkt.Route.RemoteLinkAddress != test.expectedRemoteLinkAddr { - t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedRemoteLinkAddr) + if got := pkt.Route.RemoteLinkAddress(); got != test.expectedRemoteLinkAddr { + t.Errorf("got pkt.Route.RemoteLinkAddress() = %s, want = %s", got, test.expectedRemoteLinkAddr) } if pkt.Route.RemoteAddress != test.expectedRemoteAddr { t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, test.expectedRemoteAddr) @@ -1413,11 +1445,11 @@ func TestPacketQueing(t *testing.T) { payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(udp.ProtocolNumber), - HopLimit: DefaultTTL, - SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, - DstAddr: host1IPv6Addr.AddressWithPrefix.Address, + PayloadLength: uint16(payloadLength), + TransportProtocol: udp.ProtocolNumber, + HopLimit: DefaultTTL, + SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, + DstAddr: host1IPv6Addr.AddressWithPrefix.Address, }) e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -1431,8 +1463,8 @@ func TestPacketQueing(t *testing.T) { if p.Proto != ProtocolNumber { t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber) } - if p.Route.RemoteLinkAddress != host2NICLinkAddr { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr) } checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address), @@ -1455,11 +1487,11 @@ func TestPacketQueing(t *testing.T) { pkt.SetChecksum(header.ICMPv6Checksum(pkt, host2IPv6Addr.AddressWithPrefix.Address, host1IPv6Addr.AddressWithPrefix.Address, buffer.VectorisedView{})) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: header.ICMPv6MinimumSize, - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: DefaultTTL, - SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, - DstAddr: host1IPv6Addr.AddressWithPrefix.Address, + PayloadLength: header.ICMPv6MinimumSize, + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: DefaultTTL, + SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, + DstAddr: host1IPv6Addr.AddressWithPrefix.Address, }) e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -1473,8 +1505,8 @@ func TestPacketQueing(t *testing.T) { if p.Proto != ProtocolNumber { t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber) } - if p.Route.RemoteLinkAddress != host2NICLinkAddr { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr) + if got := p.Route.RemoteLinkAddress(); got != host2NICLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, host2NICLinkAddr) } checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address), @@ -1524,8 +1556,8 @@ func TestPacketQueing(t *testing.T) { t.Errorf("got Proto = %d, want = %d", p.Proto, ProtocolNumber) } snmc := header.SolicitedNodeAddr(host2IPv6Addr.AddressWithPrefix.Address) - if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want) + if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(snmc); got != want { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, want) } checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address), @@ -1543,7 +1575,7 @@ func TestPacketQueing(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize) pkt := header.ICMPv6(hdr.Prepend(naSize)) pkt.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(pkt.NDPPayload()) + na := header.NDPNeighborAdvert(pkt.MessageBody()) na.SetSolicitedFlag(true) na.SetOverrideFlag(true) na.SetTargetAddress(host2IPv6Addr.AddressWithPrefix.Address) @@ -1554,11 +1586,11 @@ func TestPacketQueing(t *testing.T) { payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: header.NDPHopLimit, - SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, - DstAddr: host1IPv6Addr.AddressWithPrefix.Address, + PayloadLength: uint16(payloadLength), + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: header.NDPHopLimit, + SrcAddr: host2IPv6Addr.AddressWithPrefix.Address, + DstAddr: host1IPv6Addr.AddressWithPrefix.Address, }) e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -1592,7 +1624,7 @@ func TestCallsToNeighborCache(t *testing.T) { nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize icmp := header.ICMPv6(buffer.NewView(nsSize)) icmp.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns := header.NDPNeighborSolicit(icmp.MessageBody()) ns.SetTargetAddress(lladdr0) return icmp }, @@ -1612,7 +1644,7 @@ func TestCallsToNeighborCache(t *testing.T) { nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize icmp := header.ICMPv6(buffer.NewView(nsSize)) icmp.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns := header.NDPNeighborSolicit(icmp.MessageBody()) ns.SetTargetAddress(lladdr0) ns.Options().Serialize(header.NDPOptionsSerializer{ header.NDPSourceLinkLayerAddressOption(linkAddr1), @@ -1629,7 +1661,7 @@ func TestCallsToNeighborCache(t *testing.T) { nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize icmp := header.ICMPv6(buffer.NewView(nsSize)) icmp.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns := header.NDPNeighborSolicit(icmp.MessageBody()) ns.SetTargetAddress(lladdr0) return icmp }, @@ -1645,7 +1677,7 @@ func TestCallsToNeighborCache(t *testing.T) { nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize icmp := header.ICMPv6(buffer.NewView(nsSize)) icmp.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(icmp.NDPPayload()) + ns := header.NDPNeighborSolicit(icmp.MessageBody()) ns.SetTargetAddress(lladdr0) ns.Options().Serialize(header.NDPOptionsSerializer{ header.NDPSourceLinkLayerAddressOption(linkAddr1), @@ -1662,7 +1694,7 @@ func TestCallsToNeighborCache(t *testing.T) { naSize := header.ICMPv6NeighborAdvertMinimumSize icmp := header.ICMPv6(buffer.NewView(naSize)) icmp.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na := header.NDPNeighborAdvert(icmp.MessageBody()) na.SetSolicitedFlag(true) na.SetOverrideFlag(false) na.SetTargetAddress(lladdr1) @@ -1683,7 +1715,7 @@ func TestCallsToNeighborCache(t *testing.T) { naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize icmp := header.ICMPv6(buffer.NewView(naSize)) icmp.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na := header.NDPNeighborAdvert(icmp.MessageBody()) na.SetSolicitedFlag(true) na.SetOverrideFlag(false) na.SetTargetAddress(lladdr1) @@ -1702,7 +1734,7 @@ func TestCallsToNeighborCache(t *testing.T) { naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize icmp := header.ICMPv6(buffer.NewView(naSize)) icmp.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na := header.NDPNeighborAdvert(icmp.MessageBody()) na.SetSolicitedFlag(false) na.SetOverrideFlag(false) na.SetTargetAddress(lladdr1) @@ -1722,7 +1754,7 @@ func TestCallsToNeighborCache(t *testing.T) { naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize icmp := header.ICMPv6(buffer.NewView(naSize)) icmp.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(icmp.NDPPayload()) + na := header.NDPNeighborAdvert(icmp.MessageBody()) na.SetSolicitedFlag(false) na.SetOverrideFlag(false) na.SetTargetAddress(lladdr1) @@ -1796,11 +1828,11 @@ func TestCallsToNeighborCache(t *testing.T) { }) ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(len(icmp)), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: header.NDPHopLimit, - SrcAddr: test.source, - DstAddr: test.destination, + PayloadLength: uint16(len(icmp)), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: header.NDPHopLimit, + SrcAddr: test.source, + DstAddr: test.destination, }) ep.HandlePacket(pkt) diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 7a00f6314..a49b5ac77 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "fmt" "hash/fnv" + "math" "sort" "sync/atomic" "time" @@ -34,7 +35,9 @@ import ( ) const ( + // ReassembleTimeout controls how long a fragment will be held. // As per RFC 8200 section 4.5: + // // If insufficient fragments are received to complete reassembly of a packet // within 60 seconds of the reception of the first-arriving fragment of that // packet, reassembly of that packet must be abandoned. @@ -83,6 +86,7 @@ type endpoint struct { addressableEndpointState stack.AddressableEndpointState ndp ndpState + mld mldState } } @@ -118,6 +122,45 @@ type OpaqueInterfaceIdentifierOptions struct { SecretKey []byte } +// onAddressAssignedLocked handles an address being assigned. +// +// Precondition: e.mu must be exclusively locked. +func (e *endpoint) onAddressAssignedLocked(addr tcpip.Address) { + // As per RFC 2710 section 3, + // + // All MLD messages described in this document are sent with a link-local + // IPv6 Source Address, ... + // + // If we just completed DAD for a link-local address, then attempt to send any + // queued MLD reports. Note, we may have sent reports already for some of the + // groups before we had a valid link-local address to use as the source for + // the MLD messages, but that was only so that MLD snooping switches are aware + // of our membership to groups - routers would not have handled those reports. + // + // As per RFC 3590 section 4, + // + // MLD Report and Done messages are sent with a link-local address as + // the IPv6 source address, if a valid address is available on the + // interface. If a valid link-local address is not available (e.g., one + // has not been configured), the message is sent with the unspecified + // address (::) as the IPv6 source address. + // + // Once a valid link-local address is available, a node SHOULD generate + // new MLD Report messages for all multicast addresses joined on the + // interface. + // + // Routers receiving an MLD Report or Done message with the unspecified + // address as the IPv6 source address MUST silently discard the packet + // without taking any action on the packets contents. + // + // Snooping switches MUST manage multicast forwarding state based on MLD + // Report and Done messages sent with the unspecified address as the + // IPv6 source address. + if header.IsV6LinkLocalAddress(addr) { + e.mu.mld.sendQueuedReports() + } +} + // InvalidateDefaultRouter implements stack.NDPEndpoint. func (e *endpoint) InvalidateDefaultRouter(rtr tcpip.Address) { e.mu.Lock() @@ -224,6 +267,12 @@ func (e *endpoint) Enable() *tcpip.Error { return nil } + // Groups may have been joined when the endpoint was disabled, or the + // endpoint may have left groups from the perspective of MLD when the + // endpoint was disabled. Either way, we need to let routers know to + // send us multicast traffic. + e.mu.mld.initializeAll() + // Join the IPv6 All-Nodes Multicast group if the stack is configured to // use IPv6. This is required to ensure that this node properly receives // and responds to the various NDP messages that are destined to the @@ -241,8 +290,10 @@ func (e *endpoint) Enable() *tcpip.Error { // (NDP NS) messages may be sent to the All-Nodes multicast group if the // source address of the NDP NS is the unspecified address, as per RFC 4861 // section 7.2.4. - if _, err := e.mu.addressableEndpointState.JoinGroup(header.IPv6AllNodesMulticastAddress); err != nil { - return err + if err := e.joinGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil { + // joinGroupLocked only returns an error if the group address is not a valid + // IPv6 multicast address. + panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv6AllNodesMulticastAddress, err)) } // Perform DAD on the all the unicast IPv6 endpoints that are in the permanent @@ -251,7 +302,7 @@ func (e *endpoint) Enable() *tcpip.Error { // Addresses may have aleady completed DAD but in the time since the endpoint // was last enabled, other devices may have acquired the same addresses. var err *tcpip.Error - e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool { + e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { addr := addressEndpoint.AddressWithPrefix().Address if !header.IsV6UnicastAddress(addr) { return true @@ -273,7 +324,7 @@ func (e *endpoint) Enable() *tcpip.Error { } // Do not auto-generate an IPv6 link-local address for loopback devices. - if e.protocol.autoGenIPv6LinkLocal && !e.nic.IsLoopback() { + if e.protocol.options.AutoGenLinkLocal && !e.nic.IsLoopback() { // The valid and preferred lifetime is infinite for the auto-generated // link-local address. e.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime) @@ -322,7 +373,7 @@ func (e *endpoint) Disable() { } func (e *endpoint) disableLocked() { - if !e.setEnabled(false) { + if !e.Enabled() { return } @@ -331,9 +382,17 @@ func (e *endpoint) disableLocked() { e.stopDADForPermanentAddressesLocked() // The endpoint may have already left the multicast group. - if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress { + if err := e.leaveGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress { panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv6AllNodesMulticastAddress, err)) } + + // Leave groups from the perspective of MLD so that routers know that + // we are no longer interested in the group. + e.mu.mld.softLeaveAll() + + if !e.setEnabled(false) { + panic("should have only done work to disable the endpoint if it was enabled") + } } // stopDADForPermanentAddressesLocked stops DAD for all permaneent addresses. @@ -341,7 +400,7 @@ func (e *endpoint) disableLocked() { // Precondition: e.mu must be write locked. func (e *endpoint) stopDADForPermanentAddressesLocked() { // Stop DAD for all the tentative unicast addresses. - e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool { + e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { if addressEndpoint.GetKind() != stack.PermanentTentative { return true } @@ -373,19 +432,27 @@ func (e *endpoint) MTU() uint32 { // MaxHeaderLength returns the maximum length needed by ipv6 headers (and // underlying protocols). func (e *endpoint) MaxHeaderLength() uint16 { + // TODO(gvisor.dev/issues/5035): The maximum header length returned here does + // not open the possibility for the caller to know about size required for + // extension headers. return e.nic.MaxHeaderLength() + header.IPv6MinimumSize } -func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) { - length := uint16(pkt.Size()) - ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) +func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, extensionHeaders header.IPv6ExtHdrSerializer) { + extHdrsLen := extensionHeaders.Length() + length := pkt.Size() + extensionHeaders.Length() + if length > math.MaxUint16 { + panic(fmt.Sprintf("IPv6 payload too large: %d, must be <= %d", length, math.MaxUint16)) + } + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + extHdrsLen)) ip.Encode(&header.IPv6Fields{ - PayloadLength: length, - NextHeader: uint8(params.Protocol), - HopLimit: params.TTL, - TrafficClass: params.TOS, - SrcAddr: r.LocalAddress, - DstAddr: r.RemoteAddress, + PayloadLength: uint16(length), + TransportProtocol: params.Protocol, + HopLimit: params.TTL, + TrafficClass: params.TOS, + SrcAddr: srcAddr, + DstAddr: dstAddr, + ExtensionHeaders: extensionHeaders, }) pkt.NetworkProtocolNumber = ProtocolNumber } @@ -440,7 +507,7 @@ func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU ui // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { - e.addIPHeader(r, pkt, params) + e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* extensionHeaders */) // iptables filtering. All packets that reach here are locally // generated. @@ -529,7 +596,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe linkMTU := e.nic.MTU() for pb := pkts.Front(); pb != nil; pb = pb.Next() { - e.addIPHeader(r, pb, params) + e.addIPHeader(r.LocalAddress, r.RemoteAddress, pb, params, nil /* extensionHeaders */) networkMTU, err := calculateNetworkMTU(linkMTU, uint32(pb.NetworkHeader().View().Size())) if err != nil { @@ -737,8 +804,11 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { return } - addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint) - if addressEndpoint == nil { + // The destination address should be an address we own or a group we joined + // for us to receive the packet. Otherwise, attempt to forward the packet. + if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil { + addressEndpoint.DecRef() + } else if !e.IsInGroup(dstAddr) { if !e.protocol.Forwarding() { stats.IP.InvalidDestinationAddressesReceived.Increment() return @@ -747,7 +817,6 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { _ = e.forwardPacket(pkt) return } - addressEndpoint.DecRef() // vv consists of: // - Any IPv6 header bytes after the first 40 (i.e. extensions). @@ -1090,9 +1159,16 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { // // Which when taken together indicate that an unknown protocol should // be treated as an unrecognized next header value. + // The location of the Next Header field is in a different place in + // the initial IPv6 header than it is in the extension headers so + // treat it specially. + prevHdrIDOffset := uint32(header.IPv6NextHeaderOffset) + if previousHeaderStart != 0 { + prevHdrIDOffset = previousHeaderStart + } _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownHeader, - pointer: it.ParseOffset(), + pointer: prevHdrIDOffset, }, pkt) default: panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res)) @@ -1100,12 +1176,11 @@ func (e *endpoint) handlePacket(pkt *stack.PacketBuffer) { } default: - _ = e.protocol.returnError(&icmpReasonParameterProblem{ - code: header.ICMPv6UnknownHeader, - pointer: it.ParseOffset(), - }, pkt) - stats.UnknownProtocolRcvdPackets.Increment() - return + // Since the iterator returns IPv6RawPayloadHeader for unknown Extension + // Header IDs this should never happen unless we missed a supported type + // here. + panic(fmt.Sprintf("unrecognized type from it.Next() = %T", extHdr)) + } } } @@ -1153,11 +1228,6 @@ func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPre return addressEndpoint, nil } - snmc := header.SolicitedNodeAddr(addr.Address) - if _, err := e.mu.addressableEndpointState.JoinGroup(snmc); err != nil { - return nil, err - } - addressEndpoint.SetKind(stack.PermanentTentative) if e.Enabled() { @@ -1166,6 +1236,13 @@ func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPre } } + snmc := header.SolicitedNodeAddr(addr.Address) + if err := e.joinGroupLocked(snmc); err != nil { + // joinGroupLocked only returns an error if the group address is not a valid + // IPv6 multicast address. + panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", snmc, err)) + } + return addressEndpoint, nil } @@ -1211,7 +1288,8 @@ func (e *endpoint) removePermanentEndpointLocked(addressEndpoint stack.AddressEn } snmc := header.SolicitedNodeAddr(addr.Address) - if _, err := e.mu.addressableEndpointState.LeaveGroup(snmc); err != nil && err != tcpip.ErrBadLocalAddress { + // The endpoint may have already left the multicast group. + if err := e.leaveGroupLocked(snmc); err != nil && err != tcpip.ErrBadLocalAddress { return err } @@ -1234,7 +1312,7 @@ func (e *endpoint) hasPermanentAddressRLocked(addr tcpip.Address) bool { // // Precondition: e.mu must be read or write locked. func (e *endpoint) getAddressRLocked(localAddr tcpip.Address) stack.AddressEndpoint { - return e.mu.addressableEndpointState.ReadOnly().Lookup(localAddr) + return e.mu.addressableEndpointState.GetAddress(localAddr) } // MainAddress implements stack.AddressableEndpoint. @@ -1266,6 +1344,26 @@ func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allow return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired) } +// getLinkLocalAddressRLocked returns a link-local address from the primary list +// of addresses, if one is available. +// +// See stack.PrimaryEndpointBehavior for more details about the primary list. +// +// Precondition: e.mu must be read locked. +func (e *endpoint) getLinkLocalAddressRLocked() tcpip.Address { + var linkLocalAddr tcpip.Address + e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { + if addressEndpoint.IsAssigned(false /* allowExpired */) { + if addr := addressEndpoint.AddressWithPrefix().Address; header.IsV6LinkLocalAddress(addr) { + linkLocalAddr = addr + return false + } + } + return true + }) + return linkLocalAddr +} + // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress // but with locking requirements. // @@ -1285,10 +1383,10 @@ func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address // Create a candidate set of available addresses we can potentially use as a // source address. var cs []addrCandidate - e.mu.addressableEndpointState.ReadOnly().ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) { + e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { // If r is not valid for outgoing connections, it is not a valid endpoint. if !addressEndpoint.IsAssigned(allowExpired) { - return + return true } addr := addressEndpoint.AddressWithPrefix().Address @@ -1304,6 +1402,8 @@ func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address addressEndpoint: addressEndpoint, scope: scope, }) + + return true }) remoteScope, err := header.ScopeForIPv6Address(remoteAddr) @@ -1376,28 +1476,43 @@ func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { } // JoinGroup implements stack.GroupAddressableEndpoint. -func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) { +func (e *endpoint) JoinGroup(addr tcpip.Address) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + return e.joinGroupLocked(addr) +} + +// joinGroupLocked is like JoinGroup but with locking requirements. +// +// Precondition: e.mu must be locked. +func (e *endpoint) joinGroupLocked(addr tcpip.Address) *tcpip.Error { if !header.IsV6MulticastAddress(addr) { - return false, tcpip.ErrBadAddress + return tcpip.ErrBadAddress } - e.mu.Lock() - defer e.mu.Unlock() - return e.mu.addressableEndpointState.JoinGroup(addr) + e.mu.mld.joinGroup(addr) + return nil } // LeaveGroup implements stack.GroupAddressableEndpoint. -func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) { +func (e *endpoint) LeaveGroup(addr tcpip.Address) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - return e.mu.addressableEndpointState.LeaveGroup(addr) + return e.leaveGroupLocked(addr) +} + +// leaveGroupLocked is like LeaveGroup but with locking requirements. +// +// Precondition: e.mu must be locked. +func (e *endpoint) leaveGroupLocked(addr tcpip.Address) *tcpip.Error { + return e.mu.mld.leaveGroup(addr) } // IsInGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) IsInGroup(addr tcpip.Address) bool { e.mu.RLock() defer e.mu.RUnlock() - return e.mu.addressableEndpointState.IsInGroup(addr) + return e.mu.mld.isInGroup(addr) } var _ stack.ForwardingNetworkProtocol = (*protocol)(nil) @@ -1405,7 +1520,8 @@ var _ stack.NetworkProtocol = (*protocol)(nil) var _ fragmentation.TimeoutHandler = (*protocol)(nil) type protocol struct { - stack *stack.Stack + stack *stack.Stack + options Options mu struct { sync.RWMutex @@ -1429,26 +1545,6 @@ type protocol struct { forwarding uint32 fragmentation *fragmentation.Fragmentation - - // ndpDisp is the NDP event dispatcher that is used to send the netstack - // integrator NDP related events. - ndpDisp NDPDispatcher - - // ndpConfigs is the default NDP configurations used by an IPv6 endpoint. - ndpConfigs NDPConfigurations - - // opaqueIIDOpts hold the options for generating opaque interface identifiers - // (IIDs) as outlined by RFC 7217. - opaqueIIDOpts OpaqueInterfaceIdentifierOptions - - // tempIIDSeed is used to seed the initial temporary interface identifier - // history value used to generate IIDs for temporary SLAAC addresses. - tempIIDSeed []byte - - // autoGenIPv6LinkLocal determines whether or not the stack attempts to - // auto-generate an IPv6 link-local address for newly enabled non-loopback - // NICs. See the AutoGenIPv6LinkLocal field of Options for more details. - autoGenIPv6LinkLocal bool } // Number returns the ipv6 protocol number. @@ -1481,16 +1577,11 @@ func (p *protocol) NewEndpoint(nic stack.NetworkInterface, linkAddrCache stack.L dispatcher: dispatcher, protocol: p, } + e.mu.Lock() e.mu.addressableEndpointState.Init(e) - e.mu.ndp = ndpState{ - ep: e, - configs: p.ndpConfigs, - dad: make(map[tcpip.Address]dadState), - defaultRouters: make(map[tcpip.Address]defaultRouterState), - onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState), - slaacPrefixes: make(map[tcpip.Subnet]slaacPrefixState), - } - e.mu.ndp.initializeTempAddrState() + e.mu.ndp.init(e) + e.mu.mld.init(e) + e.mu.Unlock() p.mu.Lock() defer p.mu.Unlock() @@ -1613,17 +1704,17 @@ type Options struct { // NDPConfigs is the default NDP configurations used by interfaces. NDPConfigs NDPConfigurations - // AutoGenIPv6LinkLocal determines whether or not the stack attempts to - // auto-generate an IPv6 link-local address for newly enabled non-loopback + // AutoGenLinkLocal determines whether or not the stack attempts to + // auto-generate a link-local address for newly enabled non-loopback // NICs. // // Note, setting this to true does not mean that a link-local address is // assigned right away, or at all. If Duplicate Address Detection is enabled, // an address is only assigned if it successfully resolves. If it fails, no - // further attempts are made to auto-generate an IPv6 link-local adddress. + // further attempts are made to auto-generate a link-local adddress. // // The generated link-local address follows RFC 4291 Appendix A guidelines. - AutoGenIPv6LinkLocal bool + AutoGenLinkLocal bool // NDPDisp is the NDP event dispatcher that an integrator can provide to // receive NDP related events. @@ -1647,6 +1738,9 @@ type Options struct { // seed that is too small would reduce randomness and increase predictability, // defeating the purpose of temporary SLAAC addresses. TempIIDSeed []byte + + // MLD holds options for MLD. + MLD MLDOptions } // NewProtocolWithOptions returns an IPv6 network protocol. @@ -1658,15 +1752,11 @@ func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { return func(s *stack.Stack) stack.NetworkProtocol { p := &protocol{ - stack: s, + stack: s, + options: opts, + ids: ids, hashIV: hashIV, - - ndpDisp: opts.NDPDisp, - ndpConfigs: opts.NDPConfigs, - opaqueIIDOpts: opts.OpaqueIIDOpts, - tempIIDSeed: opts.TempIIDSeed, - autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal, } p.fragmentation = fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) p.mu.eps = make(map[*endpoint]struct{}) @@ -1712,24 +1802,25 @@ func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeaders hea fragPkt.NetworkProtocolNumber = ProtocolNumber originalIPHeadersLength := len(originalIPHeaders) - fragmentIPHeadersLength := originalIPHeadersLength + header.IPv6FragmentHeaderSize + + s := header.IPv6ExtHdrSerializer{&header.IPv6SerializableFragmentExtHdr{ + FragmentOffset: uint16(offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit), + M: more, + Identification: id, + }} + + fragmentIPHeadersLength := originalIPHeadersLength + s.Length() fragmentIPHeaders := header.IPv6(fragPkt.NetworkHeader().Push(fragmentIPHeadersLength)) - fragPkt.NetworkProtocolNumber = ProtocolNumber // Copy the IPv6 header and any extension headers already populated. if copied := copy(fragmentIPHeaders, originalIPHeaders); copied != originalIPHeadersLength { panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got %d, want %d", copied, originalIPHeadersLength)) } - fragmentIPHeaders.SetNextHeader(header.IPv6FragmentHeader) - fragmentIPHeaders.SetPayloadLength(uint16(copied + fragmentIPHeadersLength - header.IPv6MinimumSize)) - fragmentHeader := header.IPv6Fragment(fragmentIPHeaders[originalIPHeadersLength:]) - fragmentHeader.Encode(&header.IPv6FragmentFields{ - M: more, - FragmentOffset: uint16(offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit), - Identification: id, - NextHeader: uint8(transportProto), - }) + nextHeader, _ := s.Serialize(transportProto, fragmentIPHeaders[originalIPHeadersLength:]) + + fragmentIPHeaders.SetNextHeader(nextHeader) + fragmentIPHeaders.SetPayloadLength(uint16(copied + fragmentIPHeadersLength - header.IPv6MinimumSize)) return fragPkt, more } diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index a671d4bac..5f07d3af8 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -51,6 +51,7 @@ const ( fragmentExtHdrID = uint8(header.IPv6FragmentExtHdrIdentifier) destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier) noNextHdrID = uint8(header.IPv6NoNextHeaderIdentifier) + unknownHdrID = uint8(header.IPv6UnknownExtHdrIdentifier) extraHeaderReserve = 50 ) @@ -68,18 +69,18 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: src, - DstAddr: dst, + PayloadLength: uint16(payloadLength), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: 255, + SrcAddr: src, + DstAddr: dst, }) e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), })) - stats := s.Stats().ICMP.V6PacketsReceived + stats := s.Stats().ICMP.V6.PacketsReceived if got := stats.NeighborAdvert.Value(); got != want { t.Fatalf("got NeighborAdvert = %d, want = %d", got, want) @@ -126,11 +127,11 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(udp.ProtocolNumber), - HopLimit: 255, - SrcAddr: src, - DstAddr: dst, + PayloadLength: uint16(payloadLength), + TransportProtocol: udp.ProtocolNumber, + HopLimit: 255, + SrcAddr: src, + DstAddr: dst, }) e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ @@ -573,6 +574,33 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { expectICMP: false, }, { + name: "unknown next header (first)", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + nextHdr, 0, 63, 4, 1, 2, 3, 4, + }, unknownHdrID + }, + shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownHeader, + pointer: header.IPv6NextHeaderOffset, + }, + { + name: "unknown next header (not first)", + extHdr: func(nextHdr uint8) ([]byte, uint8) { + return []byte{ + unknownHdrID, 0, + 63, 4, 1, 2, 3, 4, + }, hopByHopExtHdrID + }, + shouldAccept: false, + expectICMP: true, + ICMPType: header.ICMPv6ParamProblem, + ICMPCode: header.ICMPv6UnknownHeader, + pointer: header.IPv6FixedHeaderSize, + }, + { name: "destination with unknown option skippable action", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{ @@ -755,11 +783,6 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { pointer: header.IPv6FixedHeaderSize, }, { - name: "No next header", - extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID }, - shouldAccept: false, - }, - { name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with skippable unknown)", extHdr: func(nextHdr uint8) ([]byte, uint8) { return []byte{ @@ -873,7 +896,13 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { Length: uint16(udpLength), }) copy(u.Payload(), udpPayload) - sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength)) + + dstAddr := tcpip.Address(addr2) + if test.multicast { + dstAddr = header.IPv6AllNodesMulticastAddress + } + + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, dstAddr, uint16(udpLength)) sum = header.Checksum(udpPayload, sum) u.SetChecksum(^u.CalculateChecksum(sum)) @@ -884,16 +913,14 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // Serialize IPv6 fixed header. payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) - dstAddr := tcpip.Address(addr2) - if test.multicast { - dstAddr = header.IPv6AllNodesMulticastAddress - } ip.Encode(&header.IPv6Fields{ PayloadLength: uint16(payloadLength), - NextHeader: ipv6NextHdr, - HopLimit: 255, - SrcAddr: addr1, - DstAddr: dstAddr, + // We're lying about transport protocol here to be able to generate + // raw extension headers from the test definitions. + TransportProtocol: tcpip.TransportProtocolNumber(ipv6NextHdr), + HopLimit: 255, + SrcAddr: addr1, + DstAddr: dstAddr, }) e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ @@ -982,9 +1009,10 @@ func TestReceiveIPv6Fragments(t *testing.T) { udpPayload2Length = 128 // Used to test cases where the fragment blocks are not a multiple of // the fragment block size of 8 (RFC 8200 section 4.5). - udpPayload3Length = 127 - udpPayload4Length = header.IPv6MaximumPayloadSize - header.UDPMinimumSize - fragmentExtHdrLen = 8 + udpPayload3Length = 127 + udpPayload4Length = header.IPv6MaximumPayloadSize - header.UDPMinimumSize + udpMaximumSizeMinus15 = header.UDPMaximumSize - 15 + fragmentExtHdrLen = 8 // Note, not all routing extension headers will be 8 bytes but this test // uses 8 byte routing extension headers for most sub tests. routingExtHdrLen = 8 @@ -1328,14 +1356,14 @@ func TestReceiveIPv6Fragments(t *testing.T) { dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+65520, + fragmentExtHdrLen+udpMaximumSizeMinus15, []buffer.View{ // Fragment extension header. // // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload4Addr1ToAddr2[:65520], + ipv6Payload4Addr1ToAddr2[:udpMaximumSizeMinus15], }, ), }, @@ -1344,14 +1372,17 @@ func TestReceiveIPv6Fragments(t *testing.T) { dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-65520, + fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-udpMaximumSizeMinus15, []buffer.View{ // Fragment extension header. // - // Fragment offset = 8190, More = false, ID = 1 - buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 255, 240, 0, 0, 0, 1}), + // Fragment offset = udpMaximumSizeMinus15/8, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, + udpMaximumSizeMinus15 >> 8, + udpMaximumSizeMinus15 & 0xff, + 0, 0, 0, 1}), - ipv6Payload4Addr1ToAddr2[65520:], + ipv6Payload4Addr1ToAddr2[udpMaximumSizeMinus15:], }, ), }, @@ -1359,6 +1390,47 @@ func TestReceiveIPv6Fragments(t *testing.T) { expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2}, }, { + name: "Two fragments with MF flag reassembled into a maximum UDP packet", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+udpMaximumSizeMinus15, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload4Addr1ToAddr2[:udpMaximumSizeMinus15], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-udpMaximumSizeMinus15, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = udpMaximumSizeMinus15/8, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, + udpMaximumSizeMinus15 >> 8, + (udpMaximumSizeMinus15 & 0xff) + 1, + 0, 0, 0, 1}), + + ipv6Payload4Addr1ToAddr2[udpMaximumSizeMinus15:], + }, + ), + }, + }, + expectedPayloads: nil, + }, + { name: "Two fragments with per-fragment routing header with zero segments left", fragments: []fragmentData{ { @@ -1877,10 +1949,12 @@ func TestReceiveIPv6Fragments(t *testing.T) { ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ PayloadLength: uint16(f.data.Size()), - NextHeader: f.nextHdr, - HopLimit: 255, - SrcAddr: f.srcAddr, - DstAddr: f.dstAddr, + // We're lying about transport protocol here so that we can generate + // raw extension headers for the tests. + TransportProtocol: tcpip.TransportProtocolNumber(f.nextHdr), + HopLimit: 255, + SrcAddr: f.srcAddr, + DstAddr: f.dstAddr, }) vv := hdr.View().ToVectorisedView() @@ -1925,7 +1999,7 @@ func TestInvalidIPv6Fragments(t *testing.T) { type fragmentData struct { ipv6Fields header.IPv6Fields - ipv6FragmentFields header.IPv6FragmentFields + ipv6FragmentFields header.IPv6SerializableFragmentExtHdr payload []byte } @@ -1944,14 +2018,13 @@ func TestInvalidIPv6Fragments(t *testing.T) { fragments: []fragmentData{ { ipv6Fields: header.IPv6Fields{ - PayloadLength: header.IPv6FragmentHeaderSize + 9, - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: header.IPv6FragmentHeaderSize + 9, + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 0 >> 3, M: true, Identification: ident, @@ -1971,14 +2044,13 @@ func TestInvalidIPv6Fragments(t *testing.T) { fragments: []fragmentData{ { ipv6Fields: header.IPv6Fields{ - PayloadLength: header.IPv6FragmentHeaderSize + 16, - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: header.IPv6FragmentHeaderSize + 16, + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: ((header.IPv6MaximumPayloadSize + 1) - 16) >> 3, M: false, Identification: ident, @@ -2019,10 +2091,9 @@ func TestInvalidIPv6Fragments(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize)) - ip.Encode(&f.ipv6Fields) - - fragHDR := header.IPv6Fragment(hdr.View()[header.IPv6MinimumSize:]) - fragHDR.Encode(&f.ipv6FragmentFields) + encodeArgs := f.ipv6Fields + encodeArgs.ExtensionHeaders = append(encodeArgs.ExtensionHeaders, &f.ipv6FragmentFields) + ip.Encode(&encodeArgs) vv := hdr.View().ToVectorisedView() vv.AppendView(f.payload) @@ -2084,7 +2155,7 @@ func TestFragmentReassemblyTimeout(t *testing.T) { type fragmentData struct { ipv6Fields header.IPv6Fields - ipv6FragmentFields header.IPv6FragmentFields + ipv6FragmentFields header.IPv6SerializableFragmentExtHdr payload []byte } @@ -2098,14 +2169,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { fragments: []fragmentData{ { ipv6Fields: header.IPv6Fields{ - PayloadLength: header.IPv6FragmentHeaderSize + 16, - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: header.IPv6FragmentHeaderSize + 16, + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 0, M: true, Identification: ident, @@ -2120,14 +2190,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { fragments: []fragmentData{ { ipv6Fields: header.IPv6Fields{ - PayloadLength: header.IPv6FragmentHeaderSize + 16, - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: header.IPv6FragmentHeaderSize + 16, + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 0, M: true, Identification: ident, @@ -2136,14 +2205,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { }, { ipv6Fields: header.IPv6Fields{ - PayloadLength: header.IPv6FragmentHeaderSize + 16, - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: header.IPv6FragmentHeaderSize + 16, + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 0, M: true, Identification: ident, @@ -2158,14 +2226,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { fragments: []fragmentData{ { ipv6Fields: header.IPv6Fields{ - PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16), - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16), + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 8, M: false, Identification: ident, @@ -2180,14 +2247,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { fragments: []fragmentData{ { ipv6Fields: header.IPv6Fields{ - PayloadLength: header.IPv6FragmentHeaderSize + 16, - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: header.IPv6FragmentHeaderSize + 16, + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 0, M: true, Identification: ident, @@ -2196,14 +2262,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { }, { ipv6Fields: header.IPv6Fields{ - PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16), - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16), + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 8, M: false, Identification: ident, @@ -2218,14 +2283,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { fragments: []fragmentData{ { ipv6Fields: header.IPv6Fields{ - PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16), - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16), + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 8, M: false, Identification: ident, @@ -2234,14 +2298,13 @@ func TestFragmentReassemblyTimeout(t *testing.T) { }, { ipv6Fields: header.IPv6Fields{ - PayloadLength: header.IPv6FragmentHeaderSize + 16, - NextHeader: header.IPv6FragmentHeader, - HopLimit: hoplimit, - SrcAddr: addr1, - DstAddr: addr2, + PayloadLength: header.IPv6FragmentHeaderSize + 16, + TransportProtocol: header.UDPProtocolNumber, + HopLimit: hoplimit, + SrcAddr: addr1, + DstAddr: addr2, }, - ipv6FragmentFields: header.IPv6FragmentFields{ - NextHeader: uint8(header.UDPProtocolNumber), + ipv6FragmentFields: header.IPv6SerializableFragmentExtHdr{ FragmentOffset: 0, M: true, Identification: ident, @@ -2280,10 +2343,11 @@ func TestFragmentReassemblyTimeout(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize)) - ip.Encode(&f.ipv6Fields) + encodeArgs := f.ipv6Fields + encodeArgs.ExtensionHeaders = append(encodeArgs.ExtensionHeaders, &f.ipv6FragmentFields) + ip.Encode(&encodeArgs) fragHDR := header.IPv6Fragment(hdr.View()[header.IPv6MinimumSize:]) - fragHDR.Encode(&f.ipv6FragmentFields) vv := hdr.View().ToVectorisedView() vv.AppendView(f.payload) @@ -2439,7 +2503,7 @@ func TestWriteStats(t *testing.T) { test.setup(t, rt.Stack()) - nWritten, _ := writer.writePackets(&rt, pkts) + nWritten, _ := writer.writePackets(rt, pkts) if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent { t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) @@ -2456,7 +2520,7 @@ func TestWriteStats(t *testing.T) { } } -func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route { +func buildRoute(t *testing.T, ep stack.LinkEndpoint) *stack.Route { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol}, }) @@ -2924,11 +2988,11 @@ func TestForwarding(t *testing.T) { icmp.SetChecksum(header.ICMPv6Checksum(icmp, remoteIPv6Addr1, remoteIPv6Addr2, buffer.VectorisedView{})) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: header.ICMPv6MinimumSize, - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: test.TTL, - SrcAddr: remoteIPv6Addr1, - DstAddr: remoteIPv6Addr2, + PayloadLength: header.ICMPv6MinimumSize, + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: test.TTL, + SrcAddr: remoteIPv6Addr1, + DstAddr: remoteIPv6Addr2, }) requestPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), diff --git a/pkg/tcpip/network/ipv6/mld.go b/pkg/tcpip/network/ipv6/mld.go new file mode 100644 index 000000000..e8d1e7a79 --- /dev/null +++ b/pkg/tcpip/network/ipv6/mld.go @@ -0,0 +1,262 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipv6 + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/network/ip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +const ( + // UnsolicitedReportIntervalMax is the maximum delay between sending + // unsolicited MLD reports. + // + // Obtained from RFC 2710 Section 7.10. + UnsolicitedReportIntervalMax = 10 * time.Second +) + +// MLDOptions holds options for MLD. +type MLDOptions struct { + // Enabled indicates whether MLD will be performed. + // + // When enabled, MLD may transmit MLD report and done messages when + // joining and leaving multicast groups respectively, and handle incoming + // MLD packets. + // + // This field is ignored and is always assumed to be false for interfaces + // without neighbouring nodes (e.g. loopback). + Enabled bool +} + +var _ ip.MulticastGroupProtocol = (*mldState)(nil) + +// mldState is the per-interface MLD state. +// +// mldState.init MUST be called to initialize the MLD state. +type mldState struct { + // The IPv6 endpoint this mldState is for. + ep *endpoint + + genericMulticastProtocol ip.GenericMulticastProtocolState +} + +// Enabled implements ip.MulticastGroupProtocol. +func (mld *mldState) Enabled() bool { + // No need to perform MLD on loopback interfaces since they don't have + // neighbouring nodes. + return mld.ep.protocol.options.MLD.Enabled && !mld.ep.nic.IsLoopback() && mld.ep.Enabled() +} + +// SendReport implements ip.MulticastGroupProtocol. +// +// Precondition: mld.ep.mu must be read locked. +func (mld *mldState) SendReport(groupAddress tcpip.Address) (bool, *tcpip.Error) { + return mld.writePacket(groupAddress, groupAddress, header.ICMPv6MulticastListenerReport) +} + +// SendLeave implements ip.MulticastGroupProtocol. +// +// Precondition: mld.ep.mu must be read locked. +func (mld *mldState) SendLeave(groupAddress tcpip.Address) *tcpip.Error { + _, err := mld.writePacket(header.IPv6AllRoutersMulticastAddress, groupAddress, header.ICMPv6MulticastListenerDone) + return err +} + +// init sets up an mldState struct, and is required to be called before using +// a new mldState. +// +// Must only be called once for the lifetime of mld. +func (mld *mldState) init(ep *endpoint) { + mld.ep = ep + mld.genericMulticastProtocol.Init(&ep.mu.RWMutex, ip.GenericMulticastProtocolOptions{ + Rand: ep.protocol.stack.Rand(), + Clock: ep.protocol.stack.Clock(), + Protocol: mld, + MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax, + AllNodesAddress: header.IPv6AllNodesMulticastAddress, + }) +} + +// handleMulticastListenerQuery handles a query message. +// +// Precondition: mld.ep.mu must be locked. +func (mld *mldState) handleMulticastListenerQuery(mldHdr header.MLD) { + mld.genericMulticastProtocol.HandleQueryLocked(mldHdr.MulticastAddress(), mldHdr.MaximumResponseDelay()) +} + +// handleMulticastListenerReport handles a report message. +// +// Precondition: mld.ep.mu must be locked. +func (mld *mldState) handleMulticastListenerReport(mldHdr header.MLD) { + mld.genericMulticastProtocol.HandleReportLocked(mldHdr.MulticastAddress()) +} + +// joinGroup handles joining a new group and sending and scheduling the required +// messages. +// +// If the group is already joined, returns tcpip.ErrDuplicateAddress. +// +// Precondition: mld.ep.mu must be locked. +func (mld *mldState) joinGroup(groupAddress tcpip.Address) { + mld.genericMulticastProtocol.JoinGroupLocked(groupAddress) +} + +// isInGroup returns true if the specified group has been joined locally. +// +// Precondition: mld.ep.mu must be read locked. +func (mld *mldState) isInGroup(groupAddress tcpip.Address) bool { + return mld.genericMulticastProtocol.IsLocallyJoinedRLocked(groupAddress) +} + +// leaveGroup handles removing the group from the membership map, cancels any +// delay timers associated with that group, and sends the Done message, if +// required. +// +// Precondition: mld.ep.mu must be locked. +func (mld *mldState) leaveGroup(groupAddress tcpip.Address) *tcpip.Error { + // LeaveGroup returns false only if the group was not joined. + if mld.genericMulticastProtocol.LeaveGroupLocked(groupAddress) { + return nil + } + + return tcpip.ErrBadLocalAddress +} + +// softLeaveAll leaves all groups from the perspective of MLD, but remains +// joined locally. +// +// Precondition: mld.ep.mu must be locked. +func (mld *mldState) softLeaveAll() { + mld.genericMulticastProtocol.MakeAllNonMemberLocked() +} + +// initializeAll attemps to initialize the MLD state for each group that has +// been joined locally. +// +// Precondition: mld.ep.mu must be locked. +func (mld *mldState) initializeAll() { + mld.genericMulticastProtocol.InitializeGroupsLocked() +} + +// sendQueuedReports attempts to send any reports that are queued for sending. +// +// Precondition: mld.ep.mu must be locked. +func (mld *mldState) sendQueuedReports() { + mld.genericMulticastProtocol.SendQueuedReportsLocked() +} + +// writePacket assembles and sends an MLD packet. +// +// Precondition: mld.ep.mu must be read locked. +func (mld *mldState) writePacket(destAddress, groupAddress tcpip.Address, mldType header.ICMPv6Type) (bool, *tcpip.Error) { + sentStats := mld.ep.protocol.stack.Stats().ICMP.V6.PacketsSent + var mldStat *tcpip.StatCounter + switch mldType { + case header.ICMPv6MulticastListenerReport: + mldStat = sentStats.MulticastListenerReport + case header.ICMPv6MulticastListenerDone: + mldStat = sentStats.MulticastListenerDone + default: + panic(fmt.Sprintf("unrecognized mld type = %d", mldType)) + } + + icmp := header.ICMPv6(buffer.NewView(header.ICMPv6HeaderSize + header.MLDMinimumSize)) + icmp.SetType(mldType) + header.MLD(icmp.MessageBody()).SetMulticastAddress(groupAddress) + // As per RFC 2710 section 3, + // + // All MLD messages described in this document are sent with a link-local + // IPv6 Source Address, an IPv6 Hop Limit of 1, and an IPv6 Router Alert + // option in a Hop-by-Hop Options header. + // + // However, this would cause problems with Duplicate Address Detection with + // the first address as MLD snooping switches may not send multicast traffic + // that DAD depends on to the node performing DAD without the MLD report, as + // documented in RFC 4816: + // + // Note that when a node joins a multicast address, it typically sends a + // Multicast Listener Discovery (MLD) report message [RFC2710] [RFC3810] + // for the multicast address. In the case of Duplicate Address + // Detection, the MLD report message is required in order to inform MLD- + // snooping switches, rather than routers, to forward multicast packets. + // In the above description, the delay for joining the multicast address + // thus means delaying transmission of the corresponding MLD report + // message. Since the MLD specifications do not request a random delay + // to avoid race conditions, just delaying Neighbor Solicitation would + // cause congestion by the MLD report messages. The congestion would + // then prevent the MLD-snooping switches from working correctly and, as + // a result, prevent Duplicate Address Detection from working. The + // requirement to include the delay for the MLD report in this case + // avoids this scenario. [RFC3590] also talks about some interaction + // issues between Duplicate Address Detection and MLD, and specifies + // which source address should be used for the MLD report in this case. + // + // As per RFC 3590 section 4, we should still send out MLD reports with an + // unspecified source address if we do not have an assigned link-local + // address to use as the source address to ensure DAD works as expected on + // networks with MLD snooping switches: + // + // MLD Report and Done messages are sent with a link-local address as + // the IPv6 source address, if a valid address is available on the + // interface. If a valid link-local address is not available (e.g., one + // has not been configured), the message is sent with the unspecified + // address (::) as the IPv6 source address. + // + // Once a valid link-local address is available, a node SHOULD generate + // new MLD Report messages for all multicast addresses joined on the + // interface. + // + // Routers receiving an MLD Report or Done message with the unspecified + // address as the IPv6 source address MUST silently discard the packet + // without taking any action on the packets contents. + // + // Snooping switches MUST manage multicast forwarding state based on MLD + // Report and Done messages sent with the unspecified address as the + // IPv6 source address. + localAddress := mld.ep.getLinkLocalAddressRLocked() + if len(localAddress) == 0 { + localAddress = header.IPv6Any + } + + icmp.SetChecksum(header.ICMPv6Checksum(icmp, localAddress, destAddress, buffer.VectorisedView{})) + + extensionHeaders := header.IPv6ExtHdrSerializer{ + header.IPv6SerializableHopByHopExtHdr{ + &header.IPv6RouterAlertOption{Value: header.IPv6RouterAlertMLD}, + }, + } + + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(mld.ep.MaxHeaderLength()) + extensionHeaders.Length(), + Data: buffer.View(icmp).ToVectorisedView(), + }) + + mld.ep.addIPHeader(localAddress, destAddress, pkt, stack.NetworkHeaderParams{ + Protocol: header.ICMPv6ProtocolNumber, + TTL: header.MLDHopLimit, + }, extensionHeaders) + if err := mld.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(destAddress), nil /* gso */, ProtocolNumber, pkt); err != nil { + sentStats.Dropped.Increment() + return false, err + } + mldStat.Increment() + return localAddress != header.IPv6Any, nil +} diff --git a/pkg/tcpip/network/ipv6/mld_test.go b/pkg/tcpip/network/ipv6/mld_test.go new file mode 100644 index 000000000..e2778b656 --- /dev/null +++ b/pkg/tcpip/network/ipv6/mld_test.go @@ -0,0 +1,297 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipv6_test + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" + "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +const ( + linkLocalAddr = "\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + globalAddr = "\x0a\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + globalMulticastAddr = "\xff\x05\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" +) + +var ( + linkLocalAddrSNMC = header.SolicitedNodeAddr(linkLocalAddr) + globalAddrSNMC = header.SolicitedNodeAddr(globalAddr) +) + +func validateMLDPacket(t *testing.T, p buffer.View, localAddress, remoteAddress tcpip.Address, mldType header.ICMPv6Type, groupAddress tcpip.Address) { + t.Helper() + + checker.IPv6WithExtHdr(t, p, + checker.IPv6ExtHdr( + checker.IPv6HopByHopExtensionHeader(checker.IPv6RouterAlert(header.IPv6RouterAlertMLD)), + ), + checker.SrcAddr(localAddress), + checker.DstAddr(remoteAddress), + // Hop Limit for an MLD message must be 1 as per RFC 2710 section 3. + checker.TTL(1), + checker.MLD(mldType, header.MLDMinimumSize, + checker.MLDMaxRespDelay(0), + checker.MLDMulticastAddress(groupAddress), + ), + ) +} + +func TestIPv6JoinLeaveSolicitedNodeAddressPerformsMLD(t *testing.T) { + const nicID = 1 + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + MLD: ipv6.MLDOptions{ + Enabled: true, + }, + })}, + }) + e := channel.New(1, header.IPv6MinimumMTU, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + + // The stack will join an address's solicited node multicast address when + // an address is added. An MLD report message should be sent for the + // solicited-node group. + if err := s.AddAddress(nicID, ipv6.ProtocolNumber, linkLocalAddr); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ipv6.ProtocolNumber, linkLocalAddr, err) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + validateMLDPacket(t, stack.PayloadSince(p.Pkt.NetworkHeader()), linkLocalAddr, linkLocalAddrSNMC, header.ICMPv6MulticastListenerReport, linkLocalAddrSNMC) + } + + // The stack will leave an address's solicited node multicast address when + // an address is removed. An MLD done message should be sent for the + // solicited-node group. + if err := s.RemoveAddress(nicID, linkLocalAddr); err != nil { + t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, linkLocalAddr, err) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a done message to be sent") + } else { + validateMLDPacket(t, stack.PayloadSince(p.Pkt.NetworkHeader()), header.IPv6Any, header.IPv6AllRoutersMulticastAddress, header.ICMPv6MulticastListenerDone, linkLocalAddrSNMC) + } +} + +func TestSendQueuedMLDReports(t *testing.T) { + const ( + nicID = 1 + maxReports = 2 + ) + + tests := []struct { + name string + dadTransmits uint8 + retransmitTimer time.Duration + }{ + { + name: "DAD Disabled", + dadTransmits: 0, + retransmitTimer: 0, + }, + { + name: "DAD Enabled", + dadTransmits: 1, + retransmitTimer: time.Second, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + dadResolutionTime := test.retransmitTimer * time.Duration(test.dadTransmits) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + DupAddrDetectTransmits: test.dadTransmits, + RetransmitTimer: test.retransmitTimer, + }, + MLD: ipv6.MLDOptions{ + Enabled: true, + }, + })}, + Clock: clock, + }) + + // Allow space for an extra packet so we can observe packets that were + // unexpectedly sent. + e := channel.New(maxReports+int(test.dadTransmits)+1 /* extra */, header.IPv6MinimumMTU, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + + resolveDAD := func(addr, snmc tcpip.Address) { + clock.Advance(dadResolutionTime) + if p, ok := e.Read(); !ok { + t.Fatal("expected DAD packet") + } else { + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(header.IPv6Any), + checker.DstAddr(snmc), + checker.TTL(header.NDPHopLimit), + checker.NDPNS( + checker.NDPNSTargetAddress(addr), + checker.NDPNSOptions(nil), + )) + } + } + + var reportCounter uint64 + reportStat := s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + if got := reportStat.Value(); got != reportCounter { + t.Errorf("got reportStat.Value() = %d, want = %d", got, reportCounter) + } + var doneCounter uint64 + doneStat := s.Stats().ICMP.V6.PacketsSent.MulticastListenerDone + if got := doneStat.Value(); got != doneCounter { + t.Errorf("got doneStat.Value() = %d, want = %d", got, doneCounter) + } + + // Joining a group without an assigned address should send an MLD report + // with the unspecified address. + if err := s.JoinGroup(ipv6.ProtocolNumber, nicID, globalMulticastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", ipv6.ProtocolNumber, nicID, globalMulticastAddr, err) + } + reportCounter++ + if got := reportStat.Value(); got != reportCounter { + t.Errorf("got reportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Errorf("expected MLD report for %s", globalMulticastAddr) + } else { + validateMLDPacket(t, stack.PayloadSince(p.Pkt.NetworkHeader()), header.IPv6Any, globalMulticastAddr, header.ICMPv6MulticastListenerReport, globalMulticastAddr) + } + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Errorf("got unexpected packet = %#v", p) + } + if t.Failed() { + t.FailNow() + } + + // Adding a global address should not send reports for the already joined + // group since we should only send queued reports when a link-local + // addres sis assigned. + // + // Note, we will still expect to send a report for the global address's + // solicited node address from the unspecified address as per RFC 3590 + // section 4. + if err := s.AddAddressWithOptions(nicID, ipv6.ProtocolNumber, globalAddr, stack.FirstPrimaryEndpoint); err != nil { + t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s", nicID, ipv6.ProtocolNumber, globalAddr, stack.FirstPrimaryEndpoint, err) + } + reportCounter++ + if got := reportStat.Value(); got != reportCounter { + t.Errorf("got reportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Errorf("expected MLD report for %s", globalAddrSNMC) + } else { + validateMLDPacket(t, stack.PayloadSince(p.Pkt.NetworkHeader()), header.IPv6Any, globalAddrSNMC, header.ICMPv6MulticastListenerReport, globalAddrSNMC) + } + if dadResolutionTime != 0 { + // Reports should not be sent when the address resolves. + resolveDAD(globalAddr, globalAddrSNMC) + if got := reportStat.Value(); got != reportCounter { + t.Errorf("got reportStat.Value() = %d, want = %d", got, reportCounter) + } + } + // Leave the group since we don't care about the global address's + // solicited node multicast group membership. + if err := s.LeaveGroup(ipv6.ProtocolNumber, nicID, globalAddrSNMC); err != nil { + t.Fatalf("LeaveGroup(%d, %d, %s): %s", ipv6.ProtocolNumber, nicID, globalAddrSNMC, err) + } + if got := doneStat.Value(); got != doneCounter { + t.Errorf("got doneStat.Value() = %d, want = %d", got, doneCounter) + } + if p, ok := e.Read(); ok { + t.Errorf("got unexpected packet = %#v", p) + } + if t.Failed() { + t.FailNow() + } + + // Adding a link-local address should send a report for its solicited node + // address and globalMulticastAddr. + if err := s.AddAddressWithOptions(nicID, ipv6.ProtocolNumber, linkLocalAddr, stack.CanBePrimaryEndpoint); err != nil { + t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s", nicID, ipv6.ProtocolNumber, linkLocalAddr, stack.CanBePrimaryEndpoint, err) + } + if dadResolutionTime != 0 { + reportCounter++ + if got := reportStat.Value(); got != reportCounter { + t.Errorf("got reportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Errorf("expected MLD report for %s", linkLocalAddrSNMC) + } else { + validateMLDPacket(t, stack.PayloadSince(p.Pkt.NetworkHeader()), header.IPv6Any, linkLocalAddrSNMC, header.ICMPv6MulticastListenerReport, linkLocalAddrSNMC) + } + resolveDAD(linkLocalAddr, linkLocalAddrSNMC) + } + + // We expect two batches of reports to be sent (1 batch when the + // link-local address is assigned, and another after the maximum + // unsolicited report interval. + for i := 0; i < 2; i++ { + // We expect reports to be sent (one for globalMulticastAddr and another + // for linkLocalAddrSNMC). + reportCounter += maxReports + if got := reportStat.Value(); got != reportCounter { + t.Errorf("got reportStat.Value() = %d, want = %d", got, reportCounter) + } + + addrs := map[tcpip.Address]bool{ + globalMulticastAddr: false, + linkLocalAddrSNMC: false, + } + for _ = range addrs { + p, ok := e.Read() + if !ok { + t.Fatalf("expected MLD report for %s and %s; addrs = %#v", globalMulticastAddr, linkLocalAddrSNMC, addrs) + } + + addr := header.IPv6(stack.PayloadSince(p.Pkt.NetworkHeader())).DestinationAddress() + if seen, ok := addrs[addr]; !ok { + t.Fatalf("got unexpected packet destined to %s", addr) + } else if seen { + t.Fatalf("got another packet destined to %s", addr) + } + + addrs[addr] = true + validateMLDPacket(t, stack.PayloadSince(p.Pkt.NetworkHeader()), linkLocalAddr, addr, header.ICMPv6MulticastListenerReport, addr) + + clock.Advance(ipv6.UnsolicitedReportIntervalMax) + } + } + + // Should not send any more reports. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Errorf("got unexpected packet = %#v", p) + } + }) + } +} diff --git a/pkg/tcpip/network/ipv6/ndp.go b/pkg/tcpip/network/ipv6/ndp.go index 40da011f8..d515eb622 100644 --- a/pkg/tcpip/network/ipv6/ndp.go +++ b/pkg/tcpip/network/ipv6/ndp.go @@ -20,6 +20,7 @@ import ( "math/rand" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -459,6 +460,9 @@ func (c *NDPConfigurations) validate() { // ndpState is the per-interface NDP state. type ndpState struct { + // Do not allow overwriting this state. + _ sync.NoCopy + // The IPv6 endpoint this ndpState is for. ep *endpoint @@ -471,17 +475,8 @@ type ndpState struct { // The default routers discovered through Router Advertisements. defaultRouters map[tcpip.Address]defaultRouterState - rtrSolicit struct { - // The timer used to send the next router solicitation message. - timer tcpip.Timer - - // Used to let the Router Solicitation timer know that it has been stopped. - // - // Must only be read from or written to while protected by the lock of - // the IPv6 endpoint this ndpState is associated with. MUST be set when the - // timer is set. - done *bool - } + // The job used to send the next router solicitation message. + rtrSolicitJob *tcpip.Job // The on-link prefixes discovered through Router Advertisements' Prefix // Information option. @@ -507,7 +502,7 @@ type ndpState struct { // to the DAD goroutine that DAD should stop. type dadState struct { // The DAD timer to send the next NS message, or resolve the address. - timer tcpip.Timer + job *tcpip.Job // Used to let the DAD timer know that it has been stopped. // @@ -648,96 +643,73 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressE // Consider DAD to have resolved even if no DAD messages were actually // transmitted. - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, true, nil) } + ndp.ep.onAddressAssignedLocked(addr) return nil } - var done bool - var timer tcpip.Timer - // We initially start a timer to fire immediately because some of the DAD work - // cannot be done while holding the IPv6 endpoint's lock. This is effectively - // the same as starting a goroutine but we use a timer that fires immediately - // so we can reset it for the next DAD iteration. - timer = ndp.ep.protocol.stack.Clock().AfterFunc(0, func() { - ndp.ep.mu.Lock() - defer ndp.ep.mu.Unlock() - - if done { - // If we reach this point, it means that the DAD timer fired after - // another goroutine already obtained the IPv6 endpoint lock and stopped - // DAD before this function obtained the NIC lock. Simply return here and - // do nothing further. - return - } + state := dadState{ + job: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { + state, ok := ndp.dad[addr] + if !ok { + panic(fmt.Sprintf("ndpdad: DAD timer fired but missing state for %s on NIC(%d)", addr, ndp.ep.nic.ID())) + } - if addressEndpoint.GetKind() != stack.PermanentTentative { - // The endpoint should still be marked as tentative since we are still - // performing DAD on it. - panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID())) - } + if addressEndpoint.GetKind() != stack.PermanentTentative { + // The endpoint should still be marked as tentative since we are still + // performing DAD on it. + panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID())) + } - dadDone := remaining == 0 - - var err *tcpip.Error - if !dadDone { - // Use the unspecified address as the source address when performing DAD. - addressEndpoint := ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint) - - // Do not hold the lock when sending packets which may be a long running - // task or may block link address resolution. We know this is safe - // because immediately after obtaining the lock again, we check if DAD - // has been stopped before doing any work with the IPv6 endpoint. Note, - // DAD would be stopped if the IPv6 endpoint was disabled or closed, or if - // the address was removed. - ndp.ep.mu.Unlock() - err = ndp.sendDADPacket(addr, addressEndpoint) - ndp.ep.mu.Lock() - addressEndpoint.DecRef() - } + dadDone := remaining == 0 - if done { - // If we reach this point, it means that DAD was stopped after we released - // the IPv6 endpoint's read lock and before we obtained the write lock. - return - } + var err *tcpip.Error + if !dadDone { + err = ndp.sendDADPacket(addr, addressEndpoint) + } - if dadDone { - // DAD has resolved. - addressEndpoint.SetKind(stack.Permanent) - } else if err == nil { - // DAD is not done and we had no errors when sending the last NDP NS, - // schedule the next DAD timer. - remaining-- - timer.Reset(ndp.configs.RetransmitTimer) - return - } + if dadDone { + // DAD has resolved. + addressEndpoint.SetKind(stack.Permanent) + } else if err == nil { + // DAD is not done and we had no errors when sending the last NDP NS, + // schedule the next DAD timer. + remaining-- + state.job.Schedule(ndp.configs.RetransmitTimer) + return + } - // At this point we know that either DAD is done or we hit an error sending - // the last NDP NS. Either way, clean up addr's DAD state and let the - // integrator know DAD has completed. - delete(ndp.dad, addr) + // At this point we know that either DAD is done or we hit an error + // sending the last NDP NS. Either way, clean up addr's DAD state and let + // the integrator know DAD has completed. + delete(ndp.dad, addr) - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { - ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, dadDone, err) - } + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { + ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, dadDone, err) + } - // If DAD resolved for a stable SLAAC address, attempt generation of a - // temporary SLAAC address. - if dadDone && addressEndpoint.ConfigType() == stack.AddressConfigSlaac { - // Reset the generation attempts counter as we are starting the generation - // of a new address for the SLAAC prefix. - ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */) - } - }) + if dadDone { + if addressEndpoint.ConfigType() == stack.AddressConfigSlaac { + // Reset the generation attempts counter as we are starting the + // generation of a new address for the SLAAC prefix. + ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */) + } - ndp.dad[addr] = dadState{ - timer: timer, - done: &done, + ndp.ep.onAddressAssignedLocked(addr) + } + }), } + // We initially start a timer to fire immediately because some of the DAD work + // cannot be done while holding the IPv6 endpoint's lock. This is effectively + // the same as starting a goroutine but we use a timer that fires immediately + // so we can reset it for the next DAD iteration. + state.job.Schedule(0) + ndp.dad[addr] = state + return nil } @@ -745,55 +717,31 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressE // addr. // // addr must be a tentative IPv6 address on ndp's IPv6 endpoint. -// -// The IPv6 endpoint that ndp belongs to MUST NOT be locked. func (ndp *ndpState) sendDADPacket(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) *tcpip.Error { snmc := header.SolicitedNodeAddr(addr) - r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), header.IPv6Any, snmc, ProtocolNumber, false /* multicastLoop */) - if err != nil { - return err - } - defer r.Release() - - // Route should resolve immediately since snmc is a multicast address so a - // remote link address can be calculated without a resolution process. - if c, err := r.Resolve(nil); err != nil { - // Do not consider the NIC being unknown or disabled as a fatal error. - // Since this method is required to be called when the IPv6 endpoint is not - // locked, the NIC could have been disabled or removed by another goroutine. - if err == tcpip.ErrUnknownNICID || err != tcpip.ErrInvalidEndpointState { - return err - } - - panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.ep.nic.ID(), err)) - } else if c != nil { - panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.ep.nic.ID())) - } - - icmpData := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize)) - icmpData.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(icmpData.NDPPayload()) + icmp := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize)) + icmp.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(icmp.MessageBody()) ns.SetTargetAddress(addr) - icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + icmp.SetChecksum(header.ICMPv6Checksum(icmp, header.IPv6Any, snmc, buffer.VectorisedView{})) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: int(r.MaxHeaderLength()), - Data: buffer.View(icmpData).ToVectorisedView(), + ReserveHeaderBytes: int(ndp.ep.MaxHeaderLength()), + Data: buffer.View(icmp).ToVectorisedView(), }) - sent := r.Stats().ICMP.V6PacketsSent - if err := r.WritePacket(nil, - stack.NetworkHeaderParams{ - Protocol: header.ICMPv6ProtocolNumber, - TTL: header.NDPHopLimit, - }, pkt, - ); err != nil { + sent := ndp.ep.protocol.stack.Stats().ICMP.V6.PacketsSent + ndp.ep.addIPHeader(header.IPv6Any, snmc, pkt, stack.NetworkHeaderParams{ + Protocol: header.ICMPv6ProtocolNumber, + TTL: header.NDPHopLimit, + }, nil /* extensionHeaders */) + + if err := ndp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(snmc), nil /* gso */, ProtocolNumber, pkt); err != nil { sent.Dropped.Increment() return err } sent.NeighborSolicit.Increment() - return nil } @@ -812,18 +760,11 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) { return } - if dad.timer != nil { - dad.timer.Stop() - dad.timer = nil - - *dad.done = true - dad.done = nil - } - + dad.job.Cancel() delete(ndp.dad, addr) // Let the integrator know DAD did not resolve. - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, false, nil) } } @@ -846,7 +787,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { // Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we // only inform the dispatcher on configuration changes. We do nothing else // with the information. - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { var configuration DHCPv6ConfigurationFromNDPRA switch { case ra.ManagedAddrConfFlag(): @@ -903,20 +844,20 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() { switch opt := opt.(type) { case header.NDPRecursiveDNSServer: - if ndp.ep.protocol.ndpDisp == nil { + if ndp.ep.protocol.options.NDPDisp == nil { continue } addrs, _ := opt.Addresses() - ndp.ep.protocol.ndpDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime()) + ndp.ep.protocol.options.NDPDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime()) case header.NDPDNSSearchList: - if ndp.ep.protocol.ndpDisp == nil { + if ndp.ep.protocol.options.NDPDisp == nil { continue } domainNames, _ := opt.DomainNames() - ndp.ep.protocol.ndpDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime()) + ndp.ep.protocol.options.NDPDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime()) case header.NDPPrefixInformation: prefix := opt.Subnet() @@ -964,7 +905,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { delete(ndp.defaultRouters, ip) // Let the integrator know a discovered default router is invalidated. - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnDefaultRouterInvalidated(ndp.ep.nic.ID(), ip) } } @@ -976,7 +917,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { - ndpDisp := ndp.ep.protocol.ndpDisp + ndpDisp := ndp.ep.protocol.options.NDPDisp if ndpDisp == nil { return } @@ -1006,7 +947,7 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) { - ndpDisp := ndp.ep.protocol.ndpDisp + ndpDisp := ndp.ep.protocol.options.NDPDisp if ndpDisp == nil { return } @@ -1047,7 +988,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { delete(ndp.onLinkPrefixes, prefix) // Let the integrator know a discovered on-link prefix is invalidated. - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnOnLinkPrefixInvalidated(ndp.ep.nic.ID(), prefix) } } @@ -1225,7 +1166,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) addAndAcquireSLAACAddr(addr tcpip.AddressWithPrefix, configType stack.AddressConfigType, deprecated bool) stack.AddressEndpoint { // Inform the integrator that we have a new SLAAC address. - ndpDisp := ndp.ep.protocol.ndpDisp + ndpDisp := ndp.ep.protocol.options.NDPDisp if ndpDisp == nil { return nil } @@ -1272,7 +1213,7 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt } dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures - if oIID := ndp.ep.protocol.opaqueIIDOpts; oIID.NICNameFromID != nil { + if oIID := ndp.ep.protocol.options.OpaqueIIDOpts; oIID.NICNameFromID != nil { addrBytes = header.AppendOpaqueInterfaceIdentifier( addrBytes[:header.IIDOffsetInIPv6Address], prefix, @@ -1676,7 +1617,7 @@ func (ndp *ndpState) deprecateSLAACAddress(addressEndpoint stack.AddressEndpoint } addressEndpoint.SetDeprecated(true) - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix()) } } @@ -1701,7 +1642,7 @@ func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefi // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) { - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr) } @@ -1761,7 +1702,7 @@ func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLA // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidateAddr bool) { - if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil { + if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr) } @@ -1859,7 +1800,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) { // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) startSolicitingRouters() { - if ndp.rtrSolicit.timer != nil { + if ndp.rtrSolicitJob != nil { // We are already soliciting routers. return } @@ -1876,56 +1817,14 @@ func (ndp *ndpState) startSolicitingRouters() { delay = time.Duration(rand.Int63n(int64(ndp.configs.MaxRtrSolicitationDelay))) } - var done bool - ndp.rtrSolicit.done = &done - ndp.rtrSolicit.timer = ndp.ep.protocol.stack.Clock().AfterFunc(delay, func() { - ndp.ep.mu.Lock() - if done { - // If we reach this point, it means that the RS timer fired after another - // goroutine already obtained the IPv6 endpoint lock and stopped - // solicitations. Simply return here and do nothing further. - ndp.ep.mu.Unlock() - return - } - + ndp.rtrSolicitJob = ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { // As per RFC 4861 section 4.1, the source of the RS is an address assigned // to the sending interface, or the unspecified address if no address is // assigned to the sending interface. - addressEndpoint := ndp.ep.acquireOutgoingPrimaryAddressRLocked(header.IPv6AllRoutersMulticastAddress, false) - if addressEndpoint == nil { - // Incase this ends up creating a new temporary address, we need to hold - // onto the endpoint until a route is obtained. If we decrement the - // reference count before obtaing a route, the address's resources would - // be released and attempting to obtain a route after would fail. Once a - // route is obtainted, it is safe to decrement the reference count since - // obtaining a route increments the address's reference count. - addressEndpoint = ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint) - } - ndp.ep.mu.Unlock() - - localAddr := addressEndpoint.AddressWithPrefix().Address - r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), localAddr, header.IPv6AllRoutersMulticastAddress, ProtocolNumber, false /* multicastLoop */) - addressEndpoint.DecRef() - if err != nil { - return - } - defer r.Release() - - // Route should resolve immediately since - // header.IPv6AllRoutersMulticastAddress is a multicast address so a - // remote link address can be calculated without a resolution process. - if c, err := r.Resolve(nil); err != nil { - // Do not consider the NIC being unknown or disabled as a fatal error. - // Since this method is required to be called when the IPv6 endpoint is - // not locked, the IPv6 endpoint could have been disabled or removed by - // another goroutine. - if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState { - return - } - - panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID(), err)) - } else if c != nil { - panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID())) + localAddr := header.IPv6Any + if addressEndpoint := ndp.ep.acquireOutgoingPrimaryAddressRLocked(header.IPv6AllRoutersMulticastAddress, false); addressEndpoint != nil { + localAddr = addressEndpoint.AddressWithPrefix().Address + addressEndpoint.DecRef() } // As per RFC 4861 section 4.1, an NDP RS SHOULD include the source @@ -1936,30 +1835,31 @@ func (ndp *ndpState) startSolicitingRouters() { // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by // LinkEndpoint.LinkAddress) before reaching this point. var optsSerializer header.NDPOptionsSerializer - if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(r.LocalLinkAddress) { + linkAddress := ndp.ep.nic.LinkAddress() + if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(linkAddress) { optsSerializer = header.NDPOptionsSerializer{ - header.NDPSourceLinkLayerAddressOption(r.LocalLinkAddress), + header.NDPSourceLinkLayerAddressOption(linkAddress), } } payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + int(optsSerializer.Length()) icmpData := header.ICMPv6(buffer.NewView(payloadSize)) icmpData.SetType(header.ICMPv6RouterSolicit) - rs := header.NDPRouterSolicit(icmpData.NDPPayload()) + rs := header.NDPRouterSolicit(icmpData.MessageBody()) rs.Options().Serialize(optsSerializer) - icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, localAddr, header.IPv6AllRoutersMulticastAddress, buffer.VectorisedView{})) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: int(r.MaxHeaderLength()), + ReserveHeaderBytes: int(ndp.ep.MaxHeaderLength()), Data: buffer.View(icmpData).ToVectorisedView(), }) - sent := r.Stats().ICMP.V6PacketsSent - if err := r.WritePacket(nil, - stack.NetworkHeaderParams{ - Protocol: header.ICMPv6ProtocolNumber, - TTL: header.NDPHopLimit, - }, pkt, - ); err != nil { + sent := ndp.ep.protocol.stack.Stats().ICMP.V6.PacketsSent + ndp.ep.addIPHeader(localAddr, header.IPv6AllRoutersMulticastAddress, pkt, stack.NetworkHeaderParams{ + Protocol: header.ICMPv6ProtocolNumber, + TTL: header.NDPHopLimit, + }, nil /* extensionHeaders */) + + if err := ndp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress), nil /* gso */, ProtocolNumber, pkt); err != nil { sent.Dropped.Increment() log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.ep.nic.ID(), err) // Don't send any more messages if we had an error. @@ -1969,21 +1869,12 @@ func (ndp *ndpState) startSolicitingRouters() { remaining-- } - ndp.ep.mu.Lock() - if done || remaining == 0 { - ndp.rtrSolicit.timer = nil - ndp.rtrSolicit.done = nil - } else if ndp.rtrSolicit.timer != nil { - // Note, we need to explicitly check to make sure that - // the timer field is not nil because if it was nil but - // we still reached this point, then we know the IPv6 endpoint - // was requested to stop soliciting routers so we don't - // need to send the next Router Solicitation message. - ndp.rtrSolicit.timer.Reset(ndp.configs.RtrSolicitationInterval) + if remaining != 0 { + ndp.rtrSolicitJob.Schedule(ndp.configs.RtrSolicitationInterval) } - ndp.ep.mu.Unlock() }) + ndp.rtrSolicitJob.Schedule(delay) } // stopSolicitingRouters stops soliciting routers. If routers are not currently @@ -1991,22 +1882,28 @@ func (ndp *ndpState) startSolicitingRouters() { // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) stopSolicitingRouters() { - if ndp.rtrSolicit.timer == nil { + if ndp.rtrSolicitJob == nil { // Nothing to do. return } - *ndp.rtrSolicit.done = true - ndp.rtrSolicit.timer.Stop() - ndp.rtrSolicit.timer = nil - ndp.rtrSolicit.done = nil + ndp.rtrSolicitJob.Cancel() + ndp.rtrSolicitJob = nil } -// initializeTempAddrState initializes state related to temporary SLAAC -// addresses. -func (ndp *ndpState) initializeTempAddrState() { - header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.tempIIDSeed, ndp.ep.nic.ID()) +func (ndp *ndpState) init(ep *endpoint) { + if ndp.dad != nil { + panic("attempted to initialize NDP state twice") + } + + ndp.ep = ep + ndp.configs = ep.protocol.options.NDPConfigs + ndp.dad = make(map[tcpip.Address]dadState) + ndp.defaultRouters = make(map[tcpip.Address]defaultRouterState) + ndp.onLinkPrefixes = make(map[tcpip.Subnet]onLinkPrefixState) + ndp.slaacPrefixes = make(map[tcpip.Subnet]slaacPrefixState) + header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.options.TempIIDSeed, ndp.ep.nic.ID()) if MaxDesyncFactor != 0 { ndp.temporaryAddressDesyncFactor = time.Duration(rand.Int63n(int64(MaxDesyncFactor))) } diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go index 37e8b1083..05a0d95b2 100644 --- a/pkg/tcpip/network/ipv6/ndp_test.go +++ b/pkg/tcpip/network/ipv6/ndp_test.go @@ -205,7 +205,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize) pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + ns := header.NDPNeighborSolicit(pkt.MessageBody()) ns.SetTargetAddress(lladdr0) opts := ns.Options() copy(opts, test.optsBuf) @@ -213,14 +213,14 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) { payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(payloadLength), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: 255, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) - invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid // Invalid count should initially be 0. if got := invalid.Value(); got != 0 { @@ -311,7 +311,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache(t *testi hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize) pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + ns := header.NDPNeighborSolicit(pkt.MessageBody()) ns.SetTargetAddress(lladdr0) opts := ns.Options() copy(opts, test.optsBuf) @@ -319,23 +319,23 @@ func TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache(t *testi payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(payloadLength), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: 255, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) - invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid // Invalid count should initially be 0. if got := invalid.Value(); got != 0 { t.Fatalf("got invalid = %d, want = 0", got) } - e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) neighbors, err := s.Neighbors(nicID) if err != nil { @@ -591,7 +591,7 @@ func TestNeighorSolicitationResponse(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize) pkt := header.ICMPv6(hdr.Prepend(ndpNSSize)) pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + ns := header.NDPNeighborSolicit(pkt.MessageBody()) ns.SetTargetAddress(nicAddr) opts := ns.Options() opts.Serialize(test.nsOpts) @@ -599,14 +599,14 @@ func TestNeighorSolicitationResponse(t *testing.T) { payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: test.nsSrc, - DstAddr: test.nsDst, + PayloadLength: uint16(payloadLength), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: 255, + SrcAddr: test.nsSrc, + DstAddr: test.nsDst, }) - invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid // Invalid count should initially be 0. if got := invalid.Value(); got != 0 { @@ -650,8 +650,8 @@ func TestNeighorSolicitationResponse(t *testing.T) { if p.Route.RemoteAddress != respNSDst { t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, respNSDst) } - if want := header.EthernetAddressFromMulticastIPv6Address(respNSDst); p.Route.RemoteLinkAddress != want { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want) + if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(respNSDst); got != want { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, want) } checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), @@ -672,7 +672,7 @@ func TestNeighorSolicitationResponse(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize) pkt := header.ICMPv6(hdr.Prepend(ndpNASize)) pkt.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(pkt.NDPPayload()) + na := header.NDPNeighborAdvert(pkt.MessageBody()) na.SetSolicitedFlag(true) na.SetOverrideFlag(true) na.SetTargetAddress(test.nsSrc) @@ -681,11 +681,11 @@ func TestNeighorSolicitationResponse(t *testing.T) { payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: header.NDPHopLimit, - SrcAddr: test.nsSrc, - DstAddr: nicAddr, + PayloadLength: uint16(payloadLength), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: header.NDPHopLimit, + SrcAddr: test.nsSrc, + DstAddr: nicAddr, }) e.InjectLinkAddr(ProtocolNumber, "", stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), @@ -706,8 +706,8 @@ func TestNeighorSolicitationResponse(t *testing.T) { if p.Route.RemoteAddress != test.naDst { t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, test.naDst) } - if p.Route.RemoteLinkAddress != test.naDstLinkAddr { - t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr) + if got := p.Route.RemoteLinkAddress(); got != test.naDstLinkAddr { + t.Errorf("got p.Route.RemoteLinkAddress() = %s, want = %s", got, test.naDstLinkAddr) } checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), @@ -777,7 +777,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize) pkt := header.ICMPv6(hdr.Prepend(ndpNASize)) pkt.SetType(header.ICMPv6NeighborAdvert) - ns := header.NDPNeighborAdvert(pkt.NDPPayload()) + ns := header.NDPNeighborAdvert(pkt.MessageBody()) ns.SetTargetAddress(lladdr1) opts := ns.Options() copy(opts, test.optsBuf) @@ -785,14 +785,14 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) { payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(payloadLength), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: 255, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) - invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid // Invalid count should initially be 0. if got := invalid.Value(); got != 0 { @@ -890,7 +890,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache(t *test hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize) pkt := header.ICMPv6(hdr.Prepend(ndpNASize)) pkt.SetType(header.ICMPv6NeighborAdvert) - ns := header.NDPNeighborAdvert(pkt.NDPPayload()) + ns := header.NDPNeighborAdvert(pkt.MessageBody()) ns.SetTargetAddress(lladdr1) opts := ns.Options() copy(opts, test.optsBuf) @@ -898,23 +898,23 @@ func TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache(t *test payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: 255, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(payloadLength), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: 255, + SrcAddr: lladdr1, + DstAddr: lladdr0, }) - invalid := s.Stats().ICMP.V6PacketsReceived.Invalid + invalid := s.Stats().ICMP.V6.PacketsReceived.Invalid // Invalid count should initially be 0. if got := invalid.Value(); got != 0 { t.Fatalf("got invalid = %d, want = 0", got) } - e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) neighbors, err := s.Neighbors(nicID) if err != nil { @@ -979,29 +979,25 @@ func TestNDPValidation(t *testing.T) { } handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint) { - nextHdr := uint8(header.ICMPv6ProtocolNumber) - var extensions buffer.View + var extHdrs header.IPv6ExtHdrSerializer if atomicFragment { - extensions = buffer.NewView(header.IPv6FragmentExtHdrLength) - extensions[0] = nextHdr - nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier) + extHdrs = append(extHdrs, &header.IPv6SerializableFragmentExtHdr{}) } + extHdrsLen := extHdrs.Length() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions), + ReserveHeaderBytes: header.IPv6MinimumSize + extHdrsLen, Data: payload.ToVectorisedView(), }) - ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions))) + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + extHdrsLen)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(len(payload) + len(extensions)), - NextHeader: nextHdr, - HopLimit: hopLimit, - SrcAddr: lladdr1, - DstAddr: lladdr0, + PayloadLength: uint16(len(payload) + extHdrsLen), + TransportProtocol: header.ICMPv6ProtocolNumber, + HopLimit: hopLimit, + SrcAddr: lladdr1, + DstAddr: lladdr0, + ExtensionHeaders: extHdrs, }) - if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) { - t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n) - } ep.HandlePacket(pkt) } @@ -1122,7 +1118,7 @@ func TestNDPValidation(t *testing.T) { s.SetForwarding(ProtocolNumber, true) } - stats := s.Stats().ICMP.V6PacketsReceived + stats := s.Stats().ICMP.V6.PacketsReceived invalid := stats.Invalid routerOnly := stats.RouterOnlyPacketsDroppedByHost typStat := typ.statCounter(stats) @@ -1346,19 +1342,19 @@ func TestRouterAdvertValidation(t *testing.T) { pkt := header.ICMPv6(hdr.Prepend(icmpSize)) pkt.SetType(header.ICMPv6RouterAdvert) pkt.SetCode(test.code) - copy(pkt.NDPPayload(), test.ndpPayload) + copy(pkt.MessageBody(), test.ndpPayload) payloadLength := hdr.UsedLength() pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{})) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: test.hopLimit, - SrcAddr: test.src, - DstAddr: header.IPv6AllNodesMulticastAddress, + PayloadLength: uint16(payloadLength), + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: test.hopLimit, + SrcAddr: test.src, + DstAddr: header.IPv6AllNodesMulticastAddress, }) - stats := s.Stats().ICMP.V6PacketsReceived + stats := s.Stats().ICMP.V6.PacketsReceived invalid := stats.Invalid rxRA := stats.RouterAdvert diff --git a/pkg/tcpip/network/multicast_group_test.go b/pkg/tcpip/network/multicast_group_test.go new file mode 100644 index 000000000..05d98a0a5 --- /dev/null +++ b/pkg/tcpip/network/multicast_group_test.go @@ -0,0 +1,1261 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ip_test + +import ( + "fmt" + "strings" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" + "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +const ( + linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") + + ipv4Addr = tcpip.Address("\x0a\x00\x00\x01") + ipv6Addr = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + + ipv4MulticastAddr1 = tcpip.Address("\xe0\x00\x00\x03") + ipv4MulticastAddr2 = tcpip.Address("\xe0\x00\x00\x04") + ipv4MulticastAddr3 = tcpip.Address("\xe0\x00\x00\x05") + ipv6MulticastAddr1 = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03") + ipv6MulticastAddr2 = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04") + ipv6MulticastAddr3 = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05") + + igmpMembershipQuery = uint8(header.IGMPMembershipQuery) + igmpv1MembershipReport = uint8(header.IGMPv1MembershipReport) + igmpv2MembershipReport = uint8(header.IGMPv2MembershipReport) + igmpLeaveGroup = uint8(header.IGMPLeaveGroup) + mldQuery = uint8(header.ICMPv6MulticastListenerQuery) + mldReport = uint8(header.ICMPv6MulticastListenerReport) + mldDone = uint8(header.ICMPv6MulticastListenerDone) + + maxUnsolicitedReports = 2 +) + +var ( + // unsolicitedIGMPReportIntervalMaxTenthSec is the maximum amount of time the + // NIC will wait before sending an unsolicited report after joining a + // multicast group, in deciseconds. + unsolicitedIGMPReportIntervalMaxTenthSec = func() uint8 { + const decisecond = time.Second / 10 + if ipv4.UnsolicitedReportIntervalMax%decisecond != 0 { + panic(fmt.Sprintf("UnsolicitedReportIntervalMax of %d is a lossy conversion to deciseconds", ipv4.UnsolicitedReportIntervalMax)) + } + return uint8(ipv4.UnsolicitedReportIntervalMax / decisecond) + }() + + ipv6AddrSNMC = header.SolicitedNodeAddr(ipv6Addr) +) + +// validateMLDPacket checks that a passed PacketInfo is an IPv6 MLD packet +// sent to the provided address with the passed fields set. +func validateMLDPacket(t *testing.T, p channel.PacketInfo, remoteAddress tcpip.Address, mldType uint8, maxRespTime byte, groupAddress tcpip.Address) { + t.Helper() + + payload := header.IPv6(stack.PayloadSince(p.Pkt.NetworkHeader())) + checker.IPv6WithExtHdr(t, payload, + checker.IPv6ExtHdr( + checker.IPv6HopByHopExtensionHeader(checker.IPv6RouterAlert(header.IPv6RouterAlertMLD)), + ), + checker.SrcAddr(ipv6Addr), + checker.DstAddr(remoteAddress), + // Hop Limit for an MLD message must be 1 as per RFC 2710 section 3. + checker.TTL(1), + checker.MLD(header.ICMPv6Type(mldType), header.MLDMinimumSize, + checker.MLDMaxRespDelay(time.Duration(maxRespTime)*time.Millisecond), + checker.MLDMulticastAddress(groupAddress), + ), + ) +} + +// validateIGMPPacket checks that a passed PacketInfo is an IPv4 IGMP packet +// sent to the provided address with the passed fields set. +func validateIGMPPacket(t *testing.T, p channel.PacketInfo, remoteAddress tcpip.Address, igmpType uint8, maxRespTime byte, groupAddress tcpip.Address) { + t.Helper() + + payload := header.IPv4(stack.PayloadSince(p.Pkt.NetworkHeader())) + checker.IPv4(t, payload, + checker.SrcAddr(ipv4Addr), + checker.DstAddr(remoteAddress), + // TTL for an IGMP message must be 1 as per RFC 2236 section 2. + checker.TTL(1), + checker.IPv4RouterAlert(), + checker.IGMP( + checker.IGMPType(header.IGMPType(igmpType)), + checker.IGMPMaxRespTime(header.DecisecondToDuration(maxRespTime)), + checker.IGMPGroupAddress(groupAddress), + ), + ) +} + +func createStack(t *testing.T, v4, mgpEnabled bool) (*channel.Endpoint, *stack.Stack, *faketime.ManualClock) { + t.Helper() + + e := channel.New(maxUnsolicitedReports, header.IPv6MinimumMTU, linkAddr) + s, clock := createStackWithLinkEndpoint(t, v4, mgpEnabled, e) + return e, s, clock +} + +func createStackWithLinkEndpoint(t *testing.T, v4, mgpEnabled bool, e stack.LinkEndpoint) (*stack.Stack, *faketime.ManualClock) { + t.Helper() + + igmpEnabled := v4 && mgpEnabled + mldEnabled := !v4 && mgpEnabled + + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ + ipv4.NewProtocolWithOptions(ipv4.Options{ + IGMP: ipv4.IGMPOptions{ + Enabled: igmpEnabled, + }, + }), + ipv6.NewProtocolWithOptions(ipv6.Options{ + MLD: ipv6.MLDOptions{ + Enabled: mldEnabled, + }, + }), + }, + Clock: clock, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, ipv4Addr); err != nil { + t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, ipv4Addr, err) + } + if err := s.AddAddress(nicID, ipv6.ProtocolNumber, ipv6Addr); err != nil { + t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, ipv6Addr, err) + } + + return s, clock +} + +// checkInitialIPv6Groups checks the initial IPv6 groups that a NIC will join +// when it is created with an IPv6 address. +// +// To not interfere with tests, checkInitialIPv6Groups will leave the added +// address's solicited node multicast group so that the tests can all assume +// the NIC has not joined any IPv6 groups. +func checkInitialIPv6Groups(t *testing.T, e *channel.Endpoint, s *stack.Stack, clock *faketime.ManualClock) (reportCounter uint64, leaveCounter uint64) { + t.Helper() + + stats := s.Stats().ICMP.V6.PacketsSent + + reportCounter++ + if got := stats.MulticastListenerReport.Value(); got != reportCounter { + t.Errorf("got stats.MulticastListenerReport.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + validateMLDPacket(t, p, ipv6AddrSNMC, mldReport, 0, ipv6AddrSNMC) + } + + // Leave the group to not affect the tests. This is fine since we are not + // testing DAD or the solicited node address specifically. + if err := s.LeaveGroup(ipv6.ProtocolNumber, nicID, ipv6AddrSNMC); err != nil { + t.Fatalf("LeaveGroup(%d, %d, %s): %s", ipv6.ProtocolNumber, nicID, ipv6AddrSNMC, err) + } + leaveCounter++ + if got := stats.MulticastListenerDone.Value(); got != leaveCounter { + t.Errorf("got stats.MulticastListenerDone.Value() = %d, want = %d", got, leaveCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + validateMLDPacket(t, p, header.IPv6AllRoutersMulticastAddress, mldDone, 0, ipv6AddrSNMC) + } + + // Should not send any more packets. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p) + } + + return reportCounter, leaveCounter +} + +// createAndInjectIGMPPacket creates and injects an IGMP packet with the +// specified fields. +// +// Note, the router alert option is not included in this packet. +// +// TODO(b/162198658): set the router alert option. +func createAndInjectIGMPPacket(e *channel.Endpoint, igmpType byte, maxRespTime byte, groupAddress tcpip.Address) { + buf := buffer.NewView(header.IPv4MinimumSize + header.IGMPQueryMinimumSize) + + ip := header.IPv4(buf) + ip.Encode(&header.IPv4Fields{ + TotalLength: uint16(len(buf)), + TTL: header.IGMPTTL, + Protocol: uint8(header.IGMPProtocolNumber), + SrcAddr: header.IPv4Any, + DstAddr: header.IPv4AllSystems, + }) + ip.SetChecksum(^ip.CalculateChecksum()) + + igmp := header.IGMP(buf[header.IPv4MinimumSize:]) + igmp.SetType(header.IGMPType(igmpType)) + igmp.SetMaxRespTime(maxRespTime) + igmp.SetGroupAddress(groupAddress) + igmp.SetChecksum(header.IGMPCalculateChecksum(igmp)) + + e.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{ + Data: buf.ToVectorisedView(), + }) +} + +// createAndInjectMLDPacket creates and injects an MLD packet with the +// specified fields. +// +// Note, the router alert option is not included in this packet. +// +// TODO(b/162198658): set the router alert option. +func createAndInjectMLDPacket(e *channel.Endpoint, mldType uint8, maxRespDelay byte, groupAddress tcpip.Address) { + icmpSize := header.ICMPv6HeaderSize + header.MLDMinimumSize + buf := buffer.NewView(header.IPv6MinimumSize + icmpSize) + + ip := header.IPv6(buf) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(icmpSize), + HopLimit: header.MLDHopLimit, + TransportProtocol: header.ICMPv6ProtocolNumber, + SrcAddr: header.IPv4Any, + DstAddr: header.IPv6AllNodesMulticastAddress, + }) + + icmp := header.ICMPv6(buf[header.IPv6MinimumSize:]) + icmp.SetType(header.ICMPv6Type(mldType)) + mld := header.MLD(icmp.MessageBody()) + mld.SetMaximumResponseDelay(uint16(maxRespDelay)) + mld.SetMulticastAddress(groupAddress) + icmp.SetChecksum(header.ICMPv6Checksum(icmp, header.IPv6Any, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{})) + + e.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{ + Data: buf.ToVectorisedView(), + }) +} + +// TestMGPDisabled tests that the multicast group protocol is not enabled by +// default. +func TestMGPDisabled(t *testing.T) { + tests := []struct { + name string + protoNum tcpip.NetworkProtocolNumber + multicastAddr tcpip.Address + sentReportStat func(*stack.Stack) *tcpip.StatCounter + receivedQueryStat func(*stack.Stack) *tcpip.StatCounter + rxQuery func(*channel.Endpoint) + }{ + { + name: "IGMP", + protoNum: ipv4.ProtocolNumber, + multicastAddr: ipv4MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.V2MembershipReport + }, + receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsReceived.MembershipQuery + }, + rxQuery: func(e *channel.Endpoint) { + createAndInjectIGMPPacket(e, igmpMembershipQuery, unsolicitedIGMPReportIntervalMaxTenthSec, header.IPv4Any) + }, + }, + { + name: "MLD", + protoNum: ipv6.ProtocolNumber, + multicastAddr: ipv6MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + }, + receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery + }, + rxQuery: func(e *channel.Endpoint) { + createAndInjectMLDPacket(e, mldQuery, 0, header.IPv6Any) + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e, s, clock := createStack(t, test.protoNum == ipv4.ProtocolNumber /* v4 */, false /* mgpEnabled */) + + // This NIC may join multicast groups when it is enabled but since MGP is + // disabled, no reports should be sent. + sentReportStat := test.sentReportStat(s) + if got := sentReportStat.Value(); got != 0 { + t.Fatalf("got sentReportStat.Value() = %d, want = 0", got) + } + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet, stack with disabled MGP sent packet = %#v", p.Pkt) + } + + // Test joining a specific group explicitly and verify that no reports are + // sent. + if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err) + } + if got := sentReportStat.Value(); got != 0 { + t.Fatalf("got sentReportStat.Value() = %d, want = 0", got) + } + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet, stack with disabled IGMP sent packet = %#v", p.Pkt) + } + + // Inject a general query message. This should only trigger a report to be + // sent if the MGP was enabled. + test.rxQuery(e) + if got := test.receivedQueryStat(s).Value(); got != 1 { + t.Fatalf("got receivedQueryStat(_).Value() = %d, want = 1", got) + } + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet, stack with disabled IGMP sent packet = %+v", p.Pkt) + } + }) + } +} + +func TestMGPReceiveCounters(t *testing.T) { + tests := []struct { + name string + headerType uint8 + maxRespTime byte + groupAddress tcpip.Address + statCounter func(*stack.Stack) *tcpip.StatCounter + rxMGPkt func(*channel.Endpoint, byte, byte, tcpip.Address) + }{ + { + name: "IGMP Membership Query", + headerType: igmpMembershipQuery, + maxRespTime: unsolicitedIGMPReportIntervalMaxTenthSec, + groupAddress: header.IPv4Any, + statCounter: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsReceived.MembershipQuery + }, + rxMGPkt: createAndInjectIGMPPacket, + }, + { + name: "IGMPv1 Membership Report", + headerType: igmpv1MembershipReport, + maxRespTime: 0, + groupAddress: header.IPv4AllSystems, + statCounter: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsReceived.V1MembershipReport + }, + rxMGPkt: createAndInjectIGMPPacket, + }, + { + name: "IGMPv2 Membership Report", + headerType: igmpv2MembershipReport, + maxRespTime: 0, + groupAddress: header.IPv4AllSystems, + statCounter: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsReceived.V2MembershipReport + }, + rxMGPkt: createAndInjectIGMPPacket, + }, + { + name: "IGMP Leave Group", + headerType: igmpLeaveGroup, + maxRespTime: 0, + groupAddress: header.IPv4AllRoutersGroup, + statCounter: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsReceived.LeaveGroup + }, + rxMGPkt: createAndInjectIGMPPacket, + }, + { + name: "MLD Query", + headerType: mldQuery, + maxRespTime: 0, + groupAddress: header.IPv6Any, + statCounter: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery + }, + rxMGPkt: createAndInjectMLDPacket, + }, + { + name: "MLD Report", + headerType: mldReport, + maxRespTime: 0, + groupAddress: header.IPv6Any, + statCounter: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerReport + }, + rxMGPkt: createAndInjectMLDPacket, + }, + { + name: "MLD Done", + headerType: mldDone, + maxRespTime: 0, + groupAddress: header.IPv6Any, + statCounter: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerDone + }, + rxMGPkt: createAndInjectMLDPacket, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e, s, _ := createStack(t, len(test.groupAddress) == header.IPv4AddressSize /* v4 */, true /* mgpEnabled */) + + test.rxMGPkt(e, test.headerType, test.maxRespTime, test.groupAddress) + if got := test.statCounter(s).Value(); got != 1 { + t.Fatalf("got %s received = %d, want = 1", test.name, got) + } + }) + } +} + +// TestMGPJoinGroup tests that when explicitly joining a multicast group, the +// stack schedules and sends correct Membership Reports. +func TestMGPJoinGroup(t *testing.T) { + tests := []struct { + name string + protoNum tcpip.NetworkProtocolNumber + multicastAddr tcpip.Address + maxUnsolicitedResponseDelay time.Duration + sentReportStat func(*stack.Stack) *tcpip.StatCounter + receivedQueryStat func(*stack.Stack) *tcpip.StatCounter + validateReport func(*testing.T, channel.PacketInfo) + checkInitialGroups func(*testing.T, *channel.Endpoint, *stack.Stack, *faketime.ManualClock) (uint64, uint64) + }{ + { + name: "IGMP", + protoNum: ipv4.ProtocolNumber, + multicastAddr: ipv4MulticastAddr1, + maxUnsolicitedResponseDelay: ipv4.UnsolicitedReportIntervalMax, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.V2MembershipReport + }, + receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsReceived.MembershipQuery + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1) + }, + }, + { + name: "MLD", + protoNum: ipv6.ProtocolNumber, + multicastAddr: ipv6MulticastAddr1, + maxUnsolicitedResponseDelay: ipv6.UnsolicitedReportIntervalMax, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + }, + receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1) + }, + checkInitialGroups: checkInitialIPv6Groups, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e, s, clock := createStack(t, test.protoNum == ipv4.ProtocolNumber /* v4 */, true /* mgpEnabled */) + + var reportCounter uint64 + if test.checkInitialGroups != nil { + reportCounter, _ = test.checkInitialGroups(t, e, s, clock) + } + + // Test joining a specific address explicitly and verify a Report is sent + // immediately. + if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err) + } + reportCounter++ + sentReportStat := test.sentReportStat(s) + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + test.validateReport(t, p) + } + if t.Failed() { + t.FailNow() + } + + // Verify the second report is sent by the maximum unsolicited response + // interval. + p, ok := e.Read() + if ok { + t.Fatalf("sent unexpected packet, expected report only after advancing the clock = %#v", p.Pkt) + } + clock.Advance(test.maxUnsolicitedResponseDelay) + reportCounter++ + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + test.validateReport(t, p) + } + + // Should not send any more packets. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p) + } + }) + } +} + +// TestMGPLeaveGroup tests that when leaving a previously joined multicast +// group the stack sends a leave/done message. +func TestMGPLeaveGroup(t *testing.T) { + tests := []struct { + name string + protoNum tcpip.NetworkProtocolNumber + multicastAddr tcpip.Address + sentReportStat func(*stack.Stack) *tcpip.StatCounter + sentLeaveStat func(*stack.Stack) *tcpip.StatCounter + validateReport func(*testing.T, channel.PacketInfo) + validateLeave func(*testing.T, channel.PacketInfo) + checkInitialGroups func(*testing.T, *channel.Endpoint, *stack.Stack, *faketime.ManualClock) (uint64, uint64) + }{ + { + name: "IGMP", + protoNum: ipv4.ProtocolNumber, + multicastAddr: ipv4MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.V2MembershipReport + }, + sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.LeaveGroup + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1) + }, + validateLeave: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateIGMPPacket(t, p, header.IPv4AllRoutersGroup, igmpLeaveGroup, 0, ipv4MulticastAddr1) + }, + }, + { + name: "MLD", + protoNum: ipv6.ProtocolNumber, + multicastAddr: ipv6MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + }, + sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerDone + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1) + }, + validateLeave: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateMLDPacket(t, p, header.IPv6AllRoutersMulticastAddress, mldDone, 0, ipv6MulticastAddr1) + }, + checkInitialGroups: checkInitialIPv6Groups, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e, s, clock := createStack(t, test.protoNum == ipv4.ProtocolNumber /* v4 */, true /* mgpEnabled */) + + var reportCounter uint64 + var leaveCounter uint64 + if test.checkInitialGroups != nil { + reportCounter, leaveCounter = test.checkInitialGroups(t, e, s, clock) + } + + if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err) + } + reportCounter++ + if got := test.sentReportStat(s).Value(); got != reportCounter { + t.Errorf("got sentReportStat(_).Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + test.validateReport(t, p) + } + if t.Failed() { + t.FailNow() + } + + // Leaving the group should trigger an leave/done message to be sent. + if err := s.LeaveGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("LeaveGroup(%d, nic, %s): %s", test.protoNum, test.multicastAddr, err) + } + leaveCounter++ + if got := test.sentLeaveStat(s).Value(); got != leaveCounter { + t.Fatalf("got sentLeaveStat(_).Value() = %d, want = %d", got, leaveCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a leave message to be sent") + } else { + test.validateLeave(t, p) + } + + // Should not send any more packets. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p) + } + }) + } +} + +// TestMGPQueryMessages tests that a report is sent in response to query +// messages. +func TestMGPQueryMessages(t *testing.T) { + tests := []struct { + name string + protoNum tcpip.NetworkProtocolNumber + multicastAddr tcpip.Address + maxUnsolicitedResponseDelay time.Duration + sentReportStat func(*stack.Stack) *tcpip.StatCounter + receivedQueryStat func(*stack.Stack) *tcpip.StatCounter + rxQuery func(*channel.Endpoint, uint8, tcpip.Address) + validateReport func(*testing.T, channel.PacketInfo) + maxRespTimeToDuration func(uint8) time.Duration + checkInitialGroups func(*testing.T, *channel.Endpoint, *stack.Stack, *faketime.ManualClock) (uint64, uint64) + }{ + { + name: "IGMP", + protoNum: ipv4.ProtocolNumber, + multicastAddr: ipv4MulticastAddr1, + maxUnsolicitedResponseDelay: ipv4.UnsolicitedReportIntervalMax, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.V2MembershipReport + }, + receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsReceived.MembershipQuery + }, + rxQuery: func(e *channel.Endpoint, maxRespTime uint8, groupAddress tcpip.Address) { + createAndInjectIGMPPacket(e, igmpMembershipQuery, maxRespTime, groupAddress) + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1) + }, + maxRespTimeToDuration: header.DecisecondToDuration, + }, + { + name: "MLD", + protoNum: ipv6.ProtocolNumber, + multicastAddr: ipv6MulticastAddr1, + maxUnsolicitedResponseDelay: ipv6.UnsolicitedReportIntervalMax, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + }, + receivedQueryStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsReceived.MulticastListenerQuery + }, + rxQuery: func(e *channel.Endpoint, maxRespTime uint8, groupAddress tcpip.Address) { + createAndInjectMLDPacket(e, mldQuery, maxRespTime, groupAddress) + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1) + }, + maxRespTimeToDuration: func(d uint8) time.Duration { + return time.Duration(d) * time.Millisecond + }, + checkInitialGroups: checkInitialIPv6Groups, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + subTests := []struct { + name string + multicastAddr tcpip.Address + expectReport bool + }{ + { + name: "Unspecified", + multicastAddr: tcpip.Address(strings.Repeat("\x00", len(test.multicastAddr))), + expectReport: true, + }, + { + name: "Specified", + multicastAddr: test.multicastAddr, + expectReport: true, + }, + { + name: "Specified other address", + multicastAddr: func() tcpip.Address { + addrBytes := []byte(test.multicastAddr) + addrBytes[len(addrBytes)-1]++ + return tcpip.Address(addrBytes) + }(), + expectReport: false, + }, + } + + for _, subTest := range subTests { + t.Run(subTest.name, func(t *testing.T) { + e, s, clock := createStack(t, test.protoNum == ipv4.ProtocolNumber /* v4 */, true /* mgpEnabled */) + + var reportCounter uint64 + if test.checkInitialGroups != nil { + reportCounter, _ = test.checkInitialGroups(t, e, s, clock) + } + + if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err) + } + sentReportStat := test.sentReportStat(s) + for i := 0; i < maxUnsolicitedReports; i++ { + sentReportStat := test.sentReportStat(s) + reportCounter++ + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("(i=%d) got sentReportStat.Value() = %d, want = %d", i, got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatalf("expected %d-th report message to be sent", i) + } else { + test.validateReport(t, p) + } + clock.Advance(test.maxUnsolicitedResponseDelay) + } + if t.Failed() { + t.FailNow() + } + + // Should not send any more packets until a query. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p) + } + + // Receive a query message which should trigger a report to be sent at + // some time before the maximum response time if the report is + // targeted at the host. + const maxRespTime = 100 + test.rxQuery(e, maxRespTime, subTest.multicastAddr) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p.Pkt) + } + + if subTest.expectReport { + clock.Advance(test.maxRespTimeToDuration(maxRespTime)) + reportCounter++ + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + test.validateReport(t, p) + } + } + + // Should not send any more packets. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p) + } + }) + } + }) + } +} + +// TestMGPQueryMessages tests that no further reports or leave/done messages +// are sent after receiving a report. +func TestMGPReportMessages(t *testing.T) { + tests := []struct { + name string + protoNum tcpip.NetworkProtocolNumber + multicastAddr tcpip.Address + sentReportStat func(*stack.Stack) *tcpip.StatCounter + sentLeaveStat func(*stack.Stack) *tcpip.StatCounter + rxReport func(*channel.Endpoint) + validateReport func(*testing.T, channel.PacketInfo) + maxRespTimeToDuration func(uint8) time.Duration + checkInitialGroups func(*testing.T, *channel.Endpoint, *stack.Stack, *faketime.ManualClock) (uint64, uint64) + }{ + { + name: "IGMP", + protoNum: ipv4.ProtocolNumber, + multicastAddr: ipv4MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.V2MembershipReport + }, + sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.LeaveGroup + }, + rxReport: func(e *channel.Endpoint) { + createAndInjectIGMPPacket(e, igmpv2MembershipReport, 0, ipv4MulticastAddr1) + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateIGMPPacket(t, p, ipv4MulticastAddr1, igmpv2MembershipReport, 0, ipv4MulticastAddr1) + }, + maxRespTimeToDuration: header.DecisecondToDuration, + }, + { + name: "MLD", + protoNum: ipv6.ProtocolNumber, + multicastAddr: ipv6MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + }, + sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerDone + }, + rxReport: func(e *channel.Endpoint) { + createAndInjectMLDPacket(e, mldReport, 0, ipv6MulticastAddr1) + }, + validateReport: func(t *testing.T, p channel.PacketInfo) { + t.Helper() + + validateMLDPacket(t, p, ipv6MulticastAddr1, mldReport, 0, ipv6MulticastAddr1) + }, + maxRespTimeToDuration: func(d uint8) time.Duration { + return time.Duration(d) * time.Millisecond + }, + checkInitialGroups: checkInitialIPv6Groups, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e, s, clock := createStack(t, test.protoNum == ipv4.ProtocolNumber /* v4 */, true /* mgpEnabled */) + + var reportCounter uint64 + var leaveCounter uint64 + if test.checkInitialGroups != nil { + reportCounter, leaveCounter = test.checkInitialGroups(t, e, s, clock) + } + + if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err) + } + sentReportStat := test.sentReportStat(s) + reportCounter++ + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + test.validateReport(t, p) + } + if t.Failed() { + t.FailNow() + } + + // Receiving a report for a group we joined should cancel any further + // reports. + test.rxReport(e) + clock.Advance(time.Hour) + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); ok { + t.Errorf("sent unexpected packet = %#v", p) + } + if t.Failed() { + t.FailNow() + } + + // Leaving a group after getting a report should not send a leave/done + // message. + if err := s.LeaveGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("LeaveGroup(%d, nic, %s): %s", test.protoNum, test.multicastAddr, err) + } + clock.Advance(time.Hour) + if got := test.sentLeaveStat(s).Value(); got != leaveCounter { + t.Fatalf("got sentLeaveStat(_).Value() = %d, want = %d", got, leaveCounter) + } + + // Should not send any more packets. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p) + } + }) + } +} + +func TestMGPWithNICLifecycle(t *testing.T) { + tests := []struct { + name string + protoNum tcpip.NetworkProtocolNumber + multicastAddrs []tcpip.Address + finalMulticastAddr tcpip.Address + maxUnsolicitedResponseDelay time.Duration + sentReportStat func(*stack.Stack) *tcpip.StatCounter + sentLeaveStat func(*stack.Stack) *tcpip.StatCounter + validateReport func(*testing.T, channel.PacketInfo, tcpip.Address) + validateLeave func(*testing.T, channel.PacketInfo, tcpip.Address) + getAndCheckGroupAddress func(*testing.T, map[tcpip.Address]bool, channel.PacketInfo) tcpip.Address + checkInitialGroups func(*testing.T, *channel.Endpoint, *stack.Stack, *faketime.ManualClock) (uint64, uint64) + }{ + { + name: "IGMP", + protoNum: ipv4.ProtocolNumber, + multicastAddrs: []tcpip.Address{ipv4MulticastAddr1, ipv4MulticastAddr2}, + finalMulticastAddr: ipv4MulticastAddr3, + maxUnsolicitedResponseDelay: ipv4.UnsolicitedReportIntervalMax, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.V2MembershipReport + }, + sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.LeaveGroup + }, + validateReport: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) { + t.Helper() + + validateIGMPPacket(t, p, addr, igmpv2MembershipReport, 0, addr) + }, + validateLeave: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) { + t.Helper() + + validateIGMPPacket(t, p, header.IPv4AllRoutersGroup, igmpLeaveGroup, 0, addr) + }, + getAndCheckGroupAddress: func(t *testing.T, seen map[tcpip.Address]bool, p channel.PacketInfo) tcpip.Address { + t.Helper() + + ipv4 := header.IPv4(stack.PayloadSince(p.Pkt.NetworkHeader())) + if got := tcpip.TransportProtocolNumber(ipv4.Protocol()); got != header.IGMPProtocolNumber { + t.Fatalf("got ipv4.Protocol() = %d, want = %d", got, header.IGMPProtocolNumber) + } + addr := header.IGMP(ipv4.Payload()).GroupAddress() + s, ok := seen[addr] + if !ok { + t.Fatalf("unexpectedly got a packet for group %s", addr) + } + if s { + t.Fatalf("already saw packet for group %s", addr) + } + seen[addr] = true + return addr + }, + }, + { + name: "MLD", + protoNum: ipv6.ProtocolNumber, + multicastAddrs: []tcpip.Address{ipv6MulticastAddr1, ipv6MulticastAddr2}, + finalMulticastAddr: ipv6MulticastAddr3, + maxUnsolicitedResponseDelay: ipv6.UnsolicitedReportIntervalMax, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + }, + sentLeaveStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerDone + }, + validateReport: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) { + t.Helper() + + validateMLDPacket(t, p, addr, mldReport, 0, addr) + }, + validateLeave: func(t *testing.T, p channel.PacketInfo, addr tcpip.Address) { + t.Helper() + + validateMLDPacket(t, p, header.IPv6AllRoutersMulticastAddress, mldDone, 0, addr) + }, + getAndCheckGroupAddress: func(t *testing.T, seen map[tcpip.Address]bool, p channel.PacketInfo) tcpip.Address { + t.Helper() + + ipv6 := header.IPv6(stack.PayloadSince(p.Pkt.NetworkHeader())) + + ipv6HeaderIter := header.MakeIPv6PayloadIterator( + header.IPv6ExtensionHeaderIdentifier(ipv6.NextHeader()), + buffer.View(ipv6.Payload()).ToVectorisedView(), + ) + + var transport header.IPv6RawPayloadHeader + for { + h, done, err := ipv6HeaderIter.Next() + if err != nil { + t.Fatalf("ipv6HeaderIter.Next(): %s", err) + } + if done { + t.Fatalf("ipv6HeaderIter.Next() = (%T, %t, _), want = (_, false, _)", h, done) + } + if t, ok := h.(header.IPv6RawPayloadHeader); ok { + transport = t + break + } + } + + if got := tcpip.TransportProtocolNumber(transport.Identifier); got != header.ICMPv6ProtocolNumber { + t.Fatalf("got ipv6.NextHeader() = %d, want = %d", got, header.ICMPv6ProtocolNumber) + } + icmpv6 := header.ICMPv6(transport.Buf.ToView()) + if got := icmpv6.Type(); got != header.ICMPv6MulticastListenerReport && got != header.ICMPv6MulticastListenerDone { + t.Fatalf("got icmpv6.Type() = %d, want = %d or %d", got, header.ICMPv6MulticastListenerReport, header.ICMPv6MulticastListenerDone) + } + addr := header.MLD(icmpv6.MessageBody()).MulticastAddress() + s, ok := seen[addr] + if !ok { + t.Fatalf("unexpectedly got a packet for group %s", addr) + } + if s { + t.Fatalf("already saw packet for group %s", addr) + } + seen[addr] = true + return addr + }, + checkInitialGroups: checkInitialIPv6Groups, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e, s, clock := createStack(t, test.protoNum == ipv4.ProtocolNumber /* v4 */, true /* mgpEnabled */) + + var reportCounter uint64 + var leaveCounter uint64 + if test.checkInitialGroups != nil { + reportCounter, leaveCounter = test.checkInitialGroups(t, e, s, clock) + } + + sentReportStat := test.sentReportStat(s) + for _, a := range test.multicastAddrs { + if err := s.JoinGroup(test.protoNum, nicID, a); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, a, err) + } + reportCounter++ + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatalf("expected a report message to be sent for %s", a) + } else { + test.validateReport(t, p, a) + } + } + if t.Failed() { + t.FailNow() + } + + // Leave messages should be sent for the joined groups when the NIC is + // disabled. + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("DisableNIC(%d): %s", nicID, err) + } + sentLeaveStat := test.sentLeaveStat(s) + leaveCounter += uint64(len(test.multicastAddrs)) + if got := sentLeaveStat.Value(); got != leaveCounter { + t.Errorf("got sentLeaveStat.Value() = %d, want = %d", got, leaveCounter) + } + { + seen := make(map[tcpip.Address]bool) + for _, a := range test.multicastAddrs { + seen[a] = false + } + + for i, _ := range test.multicastAddrs { + p, ok := e.Read() + if !ok { + t.Fatalf("expected (%d-th) leave message to be sent", i) + } + + test.validateLeave(t, p, test.getAndCheckGroupAddress(t, seen, p)) + } + } + if t.Failed() { + t.FailNow() + } + + // Reports should be sent for the joined groups when the NIC is enabled. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("EnableNIC(%d): %s", nicID, err) + } + reportCounter += uint64(len(test.multicastAddrs)) + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + { + seen := make(map[tcpip.Address]bool) + for _, a := range test.multicastAddrs { + seen[a] = false + } + + for i, _ := range test.multicastAddrs { + p, ok := e.Read() + if !ok { + t.Fatalf("expected (%d-th) report message to be sent", i) + } + + test.validateReport(t, p, test.getAndCheckGroupAddress(t, seen, p)) + } + } + if t.Failed() { + t.FailNow() + } + + // Joining/leaving a group while disabled should not send any messages. + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("DisableNIC(%d): %s", nicID, err) + } + leaveCounter += uint64(len(test.multicastAddrs)) + if got := sentLeaveStat.Value(); got != leaveCounter { + t.Errorf("got sentLeaveStat.Value() = %d, want = %d", got, leaveCounter) + } + for i, _ := range test.multicastAddrs { + if _, ok := e.Read(); !ok { + t.Fatalf("expected (%d-th) leave message to be sent", i) + } + } + for _, a := range test.multicastAddrs { + if err := s.LeaveGroup(test.protoNum, nicID, a); err != nil { + t.Fatalf("LeaveGroup(%d, nic, %s): %s", test.protoNum, a, err) + } + if got := sentLeaveStat.Value(); got != leaveCounter { + t.Errorf("got sentLeaveStat.Value() = %d, want = %d", got, leaveCounter) + } + if p, ok := e.Read(); ok { + t.Fatalf("leaving group %s on disabled NIC sent unexpected packet = %#v", a, p.Pkt) + } + } + if err := s.JoinGroup(test.protoNum, nicID, test.finalMulticastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.finalMulticastAddr, err) + } + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); ok { + t.Fatalf("joining group %s on disabled NIC sent unexpected packet = %#v", test.finalMulticastAddr, p.Pkt) + } + + // A report should only be sent for the group we last joined after + // enabling the NIC since the original groups were all left. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("EnableNIC(%d): %s", nicID, err) + } + reportCounter++ + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + test.validateReport(t, p, test.finalMulticastAddr) + } + + clock.Advance(test.maxUnsolicitedResponseDelay) + reportCounter++ + if got := sentReportStat.Value(); got != reportCounter { + t.Errorf("got sentReportStat.Value() = %d, want = %d", got, reportCounter) + } + if p, ok := e.Read(); !ok { + t.Fatal("expected a report message to be sent") + } else { + test.validateReport(t, p, test.finalMulticastAddr) + } + + // Should not send any more packets. + clock.Advance(time.Hour) + if p, ok := e.Read(); ok { + t.Fatalf("sent unexpected packet = %#v", p) + } + }) + } +} + +// TestMGPDisabledOnLoopback tests that the multicast group protocol is not +// performed on loopback interfaces since they have no neighbours. +func TestMGPDisabledOnLoopback(t *testing.T) { + tests := []struct { + name string + protoNum tcpip.NetworkProtocolNumber + multicastAddr tcpip.Address + sentReportStat func(*stack.Stack) *tcpip.StatCounter + }{ + { + name: "IGMP", + protoNum: ipv4.ProtocolNumber, + multicastAddr: ipv4MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().IGMP.PacketsSent.V2MembershipReport + }, + }, + { + name: "MLD", + protoNum: ipv6.ProtocolNumber, + multicastAddr: ipv6MulticastAddr1, + sentReportStat: func(s *stack.Stack) *tcpip.StatCounter { + return s.Stats().ICMP.V6.PacketsSent.MulticastListenerReport + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s, clock := createStackWithLinkEndpoint(t, test.protoNum == ipv4.ProtocolNumber /* v4 */, true /* mgpEnabled */, loopback.New()) + + sentReportStat := test.sentReportStat(s) + if got := sentReportStat.Value(); got != 0 { + t.Fatalf("got sentReportStat.Value() = %d, want = 0", got) + } + clock.Advance(time.Hour) + if got := sentReportStat.Value(); got != 0 { + t.Fatalf("got sentReportStat.Value() = %d, want = 0", got) + } + + // Test joining a specific group explicitly and verify that no reports are + // sent. + if err := s.JoinGroup(test.protoNum, nicID, test.multicastAddr); err != nil { + t.Fatalf("JoinGroup(%d, %d, %s): %s", test.protoNum, nicID, test.multicastAddr, err) + } + if got := sentReportStat.Value(); got != 0 { + t.Fatalf("got sentReportStat.Value() = %d, want = 0", got) + } + clock.Advance(time.Hour) + if got := sentReportStat.Value(); got != 0 { + t.Fatalf("got sentReportStat.Value() = %d, want = 0", got) + } + }) + } +} diff --git a/pkg/tcpip/network/testutil/testutil.go b/pkg/tcpip/network/testutil/testutil.go index 7cc52985e..5c3363759 100644 --- a/pkg/tcpip/network/testutil/testutil.go +++ b/pkg/tcpip/network/testutil/testutil.go @@ -85,21 +85,6 @@ func (ep *MockLinkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts st return n, nil } -// WriteRawPacket implements LinkEndpoint.WriteRawPacket. -func (ep *MockLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - if ep.allowPackets == 0 { - return ep.err - } - ep.allowPackets-- - - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: vv, - }) - ep.WrittenPackets = append(ep.WrittenPackets, pkt) - - return nil -} - // Attach implements LinkEndpoint.Attach. func (*MockLinkEndpoint) Attach(stack.NetworkDispatcher) {} diff --git a/pkg/tcpip/socketops.go b/pkg/tcpip/socketops.go index 2a6c7c7c0..b60a5fd76 100644 --- a/pkg/tcpip/socketops.go +++ b/pkg/tcpip/socketops.go @@ -15,31 +15,350 @@ package tcpip import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" ) -// SocketOptions contains all the variables which store values for socket -// level options. +// SocketOptionsHandler holds methods that help define endpoint specific +// behavior for socket level socket options. These must be implemented by +// endpoints to get notified when socket level options are set. +type SocketOptionsHandler interface { + // OnReuseAddressSet is invoked when SO_REUSEADDR is set for an endpoint. + OnReuseAddressSet(v bool) + + // OnReusePortSet is invoked when SO_REUSEPORT is set for an endpoint. + OnReusePortSet(v bool) + + // OnKeepAliveSet is invoked when SO_KEEPALIVE is set for an endpoint. + OnKeepAliveSet(v bool) + + // OnDelayOptionSet is invoked when TCP_NODELAY is set for an endpoint. + // Note that v will be the inverse of TCP_NODELAY option. + OnDelayOptionSet(v bool) + + // OnCorkOptionSet is invoked when TCP_CORK is set for an endpoint. + OnCorkOptionSet(v bool) + + // LastError is invoked when SO_ERROR is read for an endpoint. + LastError() *Error +} + +// DefaultSocketOptionsHandler is an embeddable type that implements no-op +// implementations for SocketOptionsHandler methods. +type DefaultSocketOptionsHandler struct{} + +var _ SocketOptionsHandler = (*DefaultSocketOptionsHandler)(nil) + +// OnReuseAddressSet implements SocketOptionsHandler.OnReuseAddressSet. +func (*DefaultSocketOptionsHandler) OnReuseAddressSet(bool) {} + +// OnReusePortSet implements SocketOptionsHandler.OnReusePortSet. +func (*DefaultSocketOptionsHandler) OnReusePortSet(bool) {} + +// OnKeepAliveSet implements SocketOptionsHandler.OnKeepAliveSet. +func (*DefaultSocketOptionsHandler) OnKeepAliveSet(bool) {} + +// OnDelayOptionSet implements SocketOptionsHandler.OnDelayOptionSet. +func (*DefaultSocketOptionsHandler) OnDelayOptionSet(bool) {} + +// OnCorkOptionSet implements SocketOptionsHandler.OnCorkOptionSet. +func (*DefaultSocketOptionsHandler) OnCorkOptionSet(bool) {} + +// LastError implements SocketOptionsHandler.LastError. +func (*DefaultSocketOptionsHandler) LastError() *Error { + return nil +} + +// SocketOptions contains all the variables which store values for SOL_SOCKET, +// SOL_IP, SOL_IPV6 and SOL_TCP level options. // // +stateify savable type SocketOptions struct { - // mu protects fields below. - mu sync.Mutex `state:"nosave"` - broadcastEnabled bool + handler SocketOptionsHandler + + // These fields are accessed and modified using atomic operations. + + // broadcastEnabled determines whether datagram sockets are allowed to + // send packets to a broadcast address. + broadcastEnabled uint32 + + // passCredEnabled determines whether SCM_CREDENTIALS socket control + // messages are enabled. + passCredEnabled uint32 + + // noChecksumEnabled determines whether UDP checksum is disabled while + // transmitting for this socket. + noChecksumEnabled uint32 + + // reuseAddressEnabled determines whether Bind() should allow reuse of + // local address. + reuseAddressEnabled uint32 + + // reusePortEnabled determines whether to permit multiple sockets to be + // bound to an identical socket address. + reusePortEnabled uint32 + + // keepAliveEnabled determines whether TCP keepalive is enabled for this + // socket. + keepAliveEnabled uint32 + + // multicastLoopEnabled determines whether multicast packets sent over a + // non-loopback interface will be looped back. Analogous to inet->mc_loop. + multicastLoopEnabled uint32 + + // receiveTOSEnabled is used to specify if the TOS ancillary message is + // passed with incoming packets. + receiveTOSEnabled uint32 + + // receiveTClassEnabled is used to specify if the IPV6_TCLASS ancillary + // message is passed with incoming packets. + receiveTClassEnabled uint32 + + // receivePacketInfoEnabled is used to specify if more inforamtion is + // provided with incoming packets such as interface index and address. + receivePacketInfoEnabled uint32 + + // hdrIncludeEnabled is used to indicate for a raw endpoint that all packets + // being written have an IP header and the endpoint should not attach an IP + // header. + hdrIncludedEnabled uint32 + + // v6OnlyEnabled is used to determine whether an IPv6 socket is to be + // restricted to sending and receiving IPv6 packets only. + v6OnlyEnabled uint32 + + // quickAckEnabled is used to represent the value of TCP_QUICKACK option. + // It currently does not have any effect on the TCP endpoint. + quickAckEnabled uint32 + + // delayOptionEnabled is used to specify if data should be sent out immediately + // by the transport protocol. For TCP, it determines if the Nagle algorithm + // is on or off. + delayOptionEnabled uint32 + + // corkOptionEnabled is used to specify if data should be held until segments + // are full by the TCP transport protocol. + corkOptionEnabled uint32 + + // receiveOriginalDstAddress is used to specify if the original destination of + // the incoming packet should be returned as an ancillary message. + receiveOriginalDstAddress uint32 + + // mu protects the access to the below fields. + mu sync.Mutex `state:"nosave"` + + // linger determines the amount of time the socket should linger before + // close. We currently implement this option for TCP socket only. + linger LingerOption +} + +// InitHandler initializes the handler. This must be called before using the +// socket options utility. +func (so *SocketOptions) InitHandler(handler SocketOptionsHandler) { + so.handler = handler +} + +func storeAtomicBool(addr *uint32, v bool) { + var val uint32 + if v { + val = 1 + } + atomic.StoreUint32(addr, val) } // GetBroadcast gets value for SO_BROADCAST option. func (so *SocketOptions) GetBroadcast() bool { - so.mu.Lock() - defer so.mu.Unlock() - - return so.broadcastEnabled + return atomic.LoadUint32(&so.broadcastEnabled) != 0 } // SetBroadcast sets value for SO_BROADCAST option. func (so *SocketOptions) SetBroadcast(v bool) { + storeAtomicBool(&so.broadcastEnabled, v) +} + +// GetPassCred gets value for SO_PASSCRED option. +func (so *SocketOptions) GetPassCred() bool { + return atomic.LoadUint32(&so.passCredEnabled) != 0 +} + +// SetPassCred sets value for SO_PASSCRED option. +func (so *SocketOptions) SetPassCred(v bool) { + storeAtomicBool(&so.passCredEnabled, v) +} + +// GetNoChecksum gets value for SO_NO_CHECK option. +func (so *SocketOptions) GetNoChecksum() bool { + return atomic.LoadUint32(&so.noChecksumEnabled) != 0 +} + +// SetNoChecksum sets value for SO_NO_CHECK option. +func (so *SocketOptions) SetNoChecksum(v bool) { + storeAtomicBool(&so.noChecksumEnabled, v) +} + +// GetReuseAddress gets value for SO_REUSEADDR option. +func (so *SocketOptions) GetReuseAddress() bool { + return atomic.LoadUint32(&so.reuseAddressEnabled) != 0 +} + +// SetReuseAddress sets value for SO_REUSEADDR option. +func (so *SocketOptions) SetReuseAddress(v bool) { + storeAtomicBool(&so.reuseAddressEnabled, v) + so.handler.OnReuseAddressSet(v) +} + +// GetReusePort gets value for SO_REUSEPORT option. +func (so *SocketOptions) GetReusePort() bool { + return atomic.LoadUint32(&so.reusePortEnabled) != 0 +} + +// SetReusePort sets value for SO_REUSEPORT option. +func (so *SocketOptions) SetReusePort(v bool) { + storeAtomicBool(&so.reusePortEnabled, v) + so.handler.OnReusePortSet(v) +} + +// GetKeepAlive gets value for SO_KEEPALIVE option. +func (so *SocketOptions) GetKeepAlive() bool { + return atomic.LoadUint32(&so.keepAliveEnabled) != 0 +} + +// SetKeepAlive sets value for SO_KEEPALIVE option. +func (so *SocketOptions) SetKeepAlive(v bool) { + storeAtomicBool(&so.keepAliveEnabled, v) + so.handler.OnKeepAliveSet(v) +} + +// GetMulticastLoop gets value for IP_MULTICAST_LOOP option. +func (so *SocketOptions) GetMulticastLoop() bool { + return atomic.LoadUint32(&so.multicastLoopEnabled) != 0 +} + +// SetMulticastLoop sets value for IP_MULTICAST_LOOP option. +func (so *SocketOptions) SetMulticastLoop(v bool) { + storeAtomicBool(&so.multicastLoopEnabled, v) +} + +// GetReceiveTOS gets value for IP_RECVTOS option. +func (so *SocketOptions) GetReceiveTOS() bool { + return atomic.LoadUint32(&so.receiveTOSEnabled) != 0 +} + +// SetReceiveTOS sets value for IP_RECVTOS option. +func (so *SocketOptions) SetReceiveTOS(v bool) { + storeAtomicBool(&so.receiveTOSEnabled, v) +} + +// GetReceiveTClass gets value for IPV6_RECVTCLASS option. +func (so *SocketOptions) GetReceiveTClass() bool { + return atomic.LoadUint32(&so.receiveTClassEnabled) != 0 +} + +// SetReceiveTClass sets value for IPV6_RECVTCLASS option. +func (so *SocketOptions) SetReceiveTClass(v bool) { + storeAtomicBool(&so.receiveTClassEnabled, v) +} + +// GetReceivePacketInfo gets value for IP_PKTINFO option. +func (so *SocketOptions) GetReceivePacketInfo() bool { + return atomic.LoadUint32(&so.receivePacketInfoEnabled) != 0 +} + +// SetReceivePacketInfo sets value for IP_PKTINFO option. +func (so *SocketOptions) SetReceivePacketInfo(v bool) { + storeAtomicBool(&so.receivePacketInfoEnabled, v) +} + +// GetHeaderIncluded gets value for IP_HDRINCL option. +func (so *SocketOptions) GetHeaderIncluded() bool { + return atomic.LoadUint32(&so.hdrIncludedEnabled) != 0 +} + +// SetHeaderIncluded sets value for IP_HDRINCL option. +func (so *SocketOptions) SetHeaderIncluded(v bool) { + storeAtomicBool(&so.hdrIncludedEnabled, v) +} + +// GetV6Only gets value for IPV6_V6ONLY option. +func (so *SocketOptions) GetV6Only() bool { + return atomic.LoadUint32(&so.v6OnlyEnabled) != 0 +} + +// SetV6Only sets value for IPV6_V6ONLY option. +// +// Preconditions: the backing TCP or UDP endpoint must be in initial state. +func (so *SocketOptions) SetV6Only(v bool) { + storeAtomicBool(&so.v6OnlyEnabled, v) +} + +// GetQuickAck gets value for TCP_QUICKACK option. +func (so *SocketOptions) GetQuickAck() bool { + return atomic.LoadUint32(&so.quickAckEnabled) != 0 +} + +// SetQuickAck sets value for TCP_QUICKACK option. +func (so *SocketOptions) SetQuickAck(v bool) { + storeAtomicBool(&so.quickAckEnabled, v) +} + +// GetDelayOption gets inverted value for TCP_NODELAY option. +func (so *SocketOptions) GetDelayOption() bool { + return atomic.LoadUint32(&so.delayOptionEnabled) != 0 +} + +// SetDelayOption sets inverted value for TCP_NODELAY option. +func (so *SocketOptions) SetDelayOption(v bool) { + storeAtomicBool(&so.delayOptionEnabled, v) + so.handler.OnDelayOptionSet(v) +} + +// GetCorkOption gets value for TCP_CORK option. +func (so *SocketOptions) GetCorkOption() bool { + return atomic.LoadUint32(&so.corkOptionEnabled) != 0 +} + +// SetCorkOption sets value for TCP_CORK option. +func (so *SocketOptions) SetCorkOption(v bool) { + storeAtomicBool(&so.corkOptionEnabled, v) + so.handler.OnCorkOptionSet(v) +} + +// GetReceiveOriginalDstAddress gets value for IP(V6)_RECVORIGDSTADDR option. +func (so *SocketOptions) GetReceiveOriginalDstAddress() bool { + return atomic.LoadUint32(&so.receiveOriginalDstAddress) != 0 +} + +// SetReceiveOriginalDstAddress sets value for IP(V6)_RECVORIGDSTADDR option. +func (so *SocketOptions) SetReceiveOriginalDstAddress(v bool) { + storeAtomicBool(&so.receiveOriginalDstAddress, v) +} + +// GetLastError gets value for SO_ERROR option. +func (so *SocketOptions) GetLastError() *Error { + return so.handler.LastError() +} + +// GetOutOfBandInline gets value for SO_OOBINLINE option. +func (*SocketOptions) GetOutOfBandInline() bool { + return true +} + +// SetOutOfBandInline sets value for SO_OOBINLINE option. We currently do not +// support disabling this option. +func (*SocketOptions) SetOutOfBandInline(bool) {} + +// GetLinger gets value for SO_LINGER option. +func (so *SocketOptions) GetLinger() LingerOption { so.mu.Lock() - defer so.mu.Unlock() + linger := so.linger + so.mu.Unlock() + return linger +} - so.broadcastEnabled = v +// SetLinger sets value for SO_LINGER option. +func (so *SocketOptions) SetLinger(linger LingerOption) { + so.mu.Lock() + so.linger = linger + so.mu.Unlock() } diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index d09ebe7fa..9cc6074da 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "most_shards") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -112,7 +112,7 @@ go_test( "transport_demuxer_test.go", "transport_test.go", ], - shard_count = 20, + shard_count = most_shards, deps = [ ":stack", "//pkg/rand", @@ -120,6 +120,7 @@ go_test( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/checker", + "//pkg/tcpip/faketime", "//pkg/tcpip/header", "//pkg/tcpip/link/channel", "//pkg/tcpip/link/loopback", @@ -131,7 +132,6 @@ go_test( "//pkg/tcpip/transport/udp", "//pkg/waiter", "@com_github_google_go_cmp//cmp:go_default_library", - "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], ) diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go index 9478f3fb7..cd423bf71 100644 --- a/pkg/tcpip/stack/addressable_endpoint_state.go +++ b/pkg/tcpip/stack/addressable_endpoint_state.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" ) -var _ GroupAddressableEndpoint = (*AddressableEndpointState)(nil) var _ AddressableEndpoint = (*AddressableEndpointState)(nil) // AddressableEndpointState is an implementation of an AddressableEndpoint. @@ -37,10 +36,6 @@ type AddressableEndpointState struct { endpoints map[tcpip.Address]*addressState primary []*addressState - - // groups holds the mapping between group addresses and the number of times - // they have been joined. - groups map[tcpip.Address]uint32 } } @@ -53,65 +48,33 @@ func (a *AddressableEndpointState) Init(networkEndpoint NetworkEndpoint) { a.mu.Lock() defer a.mu.Unlock() a.mu.endpoints = make(map[tcpip.Address]*addressState) - a.mu.groups = make(map[tcpip.Address]uint32) -} - -// ReadOnlyAddressableEndpointState provides read-only access to an -// AddressableEndpointState. -type ReadOnlyAddressableEndpointState struct { - inner *AddressableEndpointState } -// AddrOrMatching returns an endpoint for the passed address that is consisdered -// bound to the wrapped AddressableEndpointState. +// GetAddress returns the AddressEndpoint for the passed address. // -// If addr is an exact match with an existing address, that address is returned. -// Otherwise, f is called with each address and the address that f returns true -// for is returned. -// -// Returns nil of no address matches. -func (m ReadOnlyAddressableEndpointState) AddrOrMatching(addr tcpip.Address, spoofingOrPrimiscuous bool, f func(AddressEndpoint) bool) AddressEndpoint { - m.inner.mu.RLock() - defer m.inner.mu.RUnlock() - - if ep, ok := m.inner.mu.endpoints[addr]; ok { - if ep.IsAssigned(spoofingOrPrimiscuous) && ep.IncRef() { - return ep - } - } - - for _, ep := range m.inner.mu.endpoints { - if ep.IsAssigned(spoofingOrPrimiscuous) && f(ep) && ep.IncRef() { - return ep - } - } - - return nil -} - -// Lookup returns the AddressEndpoint for the passed address. +// GetAddress does not increment the address's reference count or check if the +// address is considered bound to the endpoint. // -// Returns nil if the passed address is not associated with the -// AddressableEndpointState. -func (m ReadOnlyAddressableEndpointState) Lookup(addr tcpip.Address) AddressEndpoint { - m.inner.mu.RLock() - defer m.inner.mu.RUnlock() +// Returns nil if the passed address is not associated with the endpoint. +func (a *AddressableEndpointState) GetAddress(addr tcpip.Address) AddressEndpoint { + a.mu.RLock() + defer a.mu.RUnlock() - ep, ok := m.inner.mu.endpoints[addr] + ep, ok := a.mu.endpoints[addr] if !ok { return nil } return ep } -// ForEach calls f for each address pair. +// ForEachEndpoint calls f for each address. // -// If f returns false, f is no longer be called. -func (m ReadOnlyAddressableEndpointState) ForEach(f func(AddressEndpoint) bool) { - m.inner.mu.RLock() - defer m.inner.mu.RUnlock() +// Once f returns false, f will no longer be called. +func (a *AddressableEndpointState) ForEachEndpoint(f func(AddressEndpoint) bool) { + a.mu.RLock() + defer a.mu.RUnlock() - for _, ep := range m.inner.mu.endpoints { + for _, ep := range a.mu.endpoints { if !f(ep) { return } @@ -120,18 +83,16 @@ func (m ReadOnlyAddressableEndpointState) ForEach(f func(AddressEndpoint) bool) // ForEachPrimaryEndpoint calls f for each primary address. // -// If f returns false, f is no longer be called. -func (m ReadOnlyAddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint)) { - m.inner.mu.RLock() - defer m.inner.mu.RUnlock() - for _, ep := range m.inner.mu.primary { - f(ep) - } -} +// Once f returns false, f will no longer be called. +func (a *AddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint) bool) { + a.mu.RLock() + defer a.mu.RUnlock() -// ReadOnly returns a readonly reference to a. -func (a *AddressableEndpointState) ReadOnly() ReadOnlyAddressableEndpointState { - return ReadOnlyAddressableEndpointState{inner: a} + for _, ep := range a.mu.primary { + if !f(ep) { + return + } + } } func (a *AddressableEndpointState) releaseAddressState(addrState *addressState) { @@ -335,11 +296,6 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address func (a *AddressableEndpointState) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error { a.mu.Lock() defer a.mu.Unlock() - - if _, ok := a.mu.groups[addr]; ok { - panic(fmt.Sprintf("group address = %s must be removed with LeaveGroup", addr)) - } - return a.removePermanentAddressLocked(addr) } @@ -471,8 +427,19 @@ func (a *AddressableEndpointState) acquirePrimaryAddressRLocked(isValid func(*ad return deprecatedEndpoint } -// AcquireAssignedAddress implements AddressableEndpoint. -func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint { +// AcquireAssignedAddressOrMatching returns an address endpoint that is +// considered assigned to the addressable endpoint. +// +// If the address is an exact match with an existing address, that address is +// returned. Otherwise, if f is provided, f is called with each address and +// the address that f returns true for is returned. +// +// If there is no matching address, a temporary address will be returned if +// allowTemp is true. +// +// Regardless how the address was obtained, it will be acquired before it is +// returned. +func (a *AddressableEndpointState) AcquireAssignedAddressOrMatching(localAddr tcpip.Address, f func(AddressEndpoint) bool, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint { a.mu.Lock() defer a.mu.Unlock() @@ -488,6 +455,14 @@ func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Addres return addrState } + if f != nil { + for _, addrState := range a.mu.endpoints { + if addrState.IsAssigned(allowTemp) && f(addrState) && addrState.IncRef() { + return addrState + } + } + } + if !allowTemp { return nil } @@ -520,6 +495,11 @@ func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Addres return ep } +// AcquireAssignedAddress implements AddressableEndpoint. +func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint { + return a.AcquireAssignedAddressOrMatching(localAddr, nil, allowTemp, tempPEB) +} + // AcquireOutgoingPrimaryAddress implements AddressableEndpoint. func (a *AddressableEndpointState) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint { a.mu.RLock() @@ -588,72 +568,11 @@ func (a *AddressableEndpointState) PermanentAddresses() []tcpip.AddressWithPrefi return addrs } -// JoinGroup implements GroupAddressableEndpoint. -func (a *AddressableEndpointState) JoinGroup(group tcpip.Address) (bool, *tcpip.Error) { - a.mu.Lock() - defer a.mu.Unlock() - - joins, ok := a.mu.groups[group] - if !ok { - ep, err := a.addAndAcquireAddressLocked(group.WithPrefix(), NeverPrimaryEndpoint, AddressConfigStatic, false /* deprecated */, true /* permanent */) - if err != nil { - return false, err - } - // We have no need for the address endpoint. - a.decAddressRefLocked(ep) - } - - a.mu.groups[group] = joins + 1 - return !ok, nil -} - -// LeaveGroup implements GroupAddressableEndpoint. -func (a *AddressableEndpointState) LeaveGroup(group tcpip.Address) (bool, *tcpip.Error) { - a.mu.Lock() - defer a.mu.Unlock() - - joins, ok := a.mu.groups[group] - if !ok { - return false, tcpip.ErrBadLocalAddress - } - - if joins == 1 { - a.removeGroupAddressLocked(group) - delete(a.mu.groups, group) - return true, nil - } - - a.mu.groups[group] = joins - 1 - return false, nil -} - -// IsInGroup implements GroupAddressableEndpoint. -func (a *AddressableEndpointState) IsInGroup(group tcpip.Address) bool { - a.mu.RLock() - defer a.mu.RUnlock() - _, ok := a.mu.groups[group] - return ok -} - -func (a *AddressableEndpointState) removeGroupAddressLocked(group tcpip.Address) { - if err := a.removePermanentAddressLocked(group); err != nil { - // removePermanentEndpointLocked would only return an error if group is - // not bound to the addressable endpoint, but we know it MUST be assigned - // since we have group in our map of groups. - panic(fmt.Sprintf("error removing group address = %s: %s", group, err)) - } -} - // Cleanup forcefully leaves all groups and removes all permanent addresses. func (a *AddressableEndpointState) Cleanup() { a.mu.Lock() defer a.mu.Unlock() - for group := range a.mu.groups { - a.removeGroupAddressLocked(group) - } - a.mu.groups = make(map[tcpip.Address]uint32) - for _, ep := range a.mu.endpoints { // removePermanentEndpointLocked returns tcpip.ErrBadLocalAddress if ep is // not a permanent address. diff --git a/pkg/tcpip/stack/addressable_endpoint_state_test.go b/pkg/tcpip/stack/addressable_endpoint_state_test.go index 26787d0a3..140f146f6 100644 --- a/pkg/tcpip/stack/addressable_endpoint_state_test.go +++ b/pkg/tcpip/stack/addressable_endpoint_state_test.go @@ -53,25 +53,9 @@ func TestAddressableEndpointStateCleanup(t *testing.T) { ep.DecRef() } - group := tcpip.Address("\x02") - if added, err := s.JoinGroup(group); err != nil { - t.Fatalf("s.JoinGroup(%s): %s", group, err) - } else if !added { - t.Fatalf("got s.JoinGroup(%s) = false, want = true", group) - } - if !s.IsInGroup(group) { - t.Fatalf("got s.IsInGroup(%s) = false, want = true", group) - } - s.Cleanup() - { - ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint) - if ep != nil { - ep.DecRef() - t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = %s, want = nil", addr.Address, ep.AddressWithPrefix()) - } - } - if s.IsInGroup(group) { - t.Fatalf("got s.IsInGroup(%s) = true, want = false", group) + if ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint); ep != nil { + ep.DecRef() + t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = %s, want = nil", addr.Address, ep.AddressWithPrefix()) } } diff --git a/pkg/tcpip/stack/forwarding_test.go b/pkg/tcpip/stack/forwarding_test.go index 6dc9e7859..5ec9b3411 100644 --- a/pkg/tcpip/stack/forwarding_test.go +++ b/pkg/tcpip/stack/forwarding_test.go @@ -309,7 +309,7 @@ func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress { func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { p := fwdTestPacketInfo{ - RemoteLinkAddress: r.RemoteLinkAddress, + RemoteLinkAddress: r.RemoteLinkAddress(), LocalLinkAddress: r.LocalLinkAddress, Pkt: pkt, } @@ -333,20 +333,6 @@ func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBuffer return n, nil } -// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. -func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - p := fwdTestPacketInfo{ - Pkt: NewPacketBuffer(PacketBufferOptions{Data: vv}), - } - - select { - case e.C <- p: - default: - } - - return nil -} - // Wait implements stack.LinkEndpoint.Wait. func (*fwdTestLinkEndpoint) Wait() {} diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 73a01c2dd..03d7b4e0d 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" + "gvisor.dev/gvisor/pkg/tcpip/faketime" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" @@ -352,7 +353,7 @@ func TestDADDisabled(t *testing.T) { } // We should not have sent any NDP NS messages. - if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 { + if got := s.Stats().ICMP.V6.PacketsSent.NeighborSolicit.Value(); got != 0 { t.Fatalf("got NeighborSolicit = %d, want = 0", got) } } @@ -465,14 +466,18 @@ func TestDADResolve(t *testing.T) { if err != tcpip.ErrNoRoute { t.Errorf("got FindRoute(%d, '', %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute) } - r.Release() + if r != nil { + r.Release() + } } { r, err := s.FindRoute(nicID, addr1, addr2, header.IPv6ProtocolNumber, false) if err != tcpip.ErrNoRoute { t.Errorf("got FindRoute(%d, %s, %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr1, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute) } - r.Release() + if r != nil { + r.Release() + } } if t.Failed() { @@ -510,7 +515,9 @@ func TestDADResolve(t *testing.T) { } else if r.LocalAddress != addr1 { t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, addr1) } - r.Release() + if r != nil { + r.Release() + } } if t.Failed() { @@ -518,7 +525,7 @@ func TestDADResolve(t *testing.T) { } // Should not have sent any more NS messages. - if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != uint64(test.dupAddrDetectTransmits) { + if got := s.Stats().ICMP.V6.PacketsSent.NeighborSolicit.Value(); got != uint64(test.dupAddrDetectTransmits) { t.Fatalf("got NeighborSolicit = %d, want = %d", got, test.dupAddrDetectTransmits) } @@ -533,8 +540,8 @@ func TestDADResolve(t *testing.T) { // Make sure the right remote link address is used. snmc := header.SolicitedNodeAddr(addr1) - if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want { - t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want) + if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(snmc); got != want { + t.Errorf("got remote link address = %s, want = %s", got, want) } // Check NDP NS packet. @@ -563,18 +570,18 @@ func rxNDPSolicit(e *channel.Endpoint, tgt tcpip.Address) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize) pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize)) pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + ns := header.NDPNeighborSolicit(pkt.MessageBody()) ns.SetTargetAddress(tgt) snmc := header.SolicitedNodeAddr(tgt) pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{})) payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: 255, - SrcAddr: header.IPv6Any, - DstAddr: snmc, + PayloadLength: uint16(payloadLength), + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: 255, + SrcAddr: header.IPv6Any, + DstAddr: snmc, }) e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{Data: hdr.View().ToVectorisedView()})) } @@ -605,7 +612,7 @@ func TestDADFail(t *testing.T) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize) pkt := header.ICMPv6(hdr.Prepend(naSize)) pkt.SetType(header.ICMPv6NeighborAdvert) - na := header.NDPNeighborAdvert(pkt.NDPPayload()) + na := header.NDPNeighborAdvert(pkt.MessageBody()) na.SetSolicitedFlag(true) na.SetOverrideFlag(true) na.SetTargetAddress(tgt) @@ -616,11 +623,11 @@ func TestDADFail(t *testing.T) { payloadLength := hdr.UsedLength() ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: 255, - SrcAddr: tgt, - DstAddr: header.IPv6AllNodesMulticastAddress, + PayloadLength: uint16(payloadLength), + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: 255, + SrcAddr: tgt, + DstAddr: header.IPv6AllNodesMulticastAddress, }) e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{Data: hdr.View().ToVectorisedView()})) }, @@ -666,7 +673,7 @@ func TestDADFail(t *testing.T) { // Receive a packet to simulate an address conflict. test.rxPkt(e, addr1) - stat := test.getStat(s.Stats().ICMP.V6PacketsReceived) + stat := test.getStat(s.Stats().ICMP.V6.PacketsReceived) if got := stat.Value(); got != 1 { t.Fatalf("got stat = %d, want = 1", got) } @@ -803,7 +810,7 @@ func TestDADStop(t *testing.T) { } // Should not have sent more than 1 NS message. - if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 { + if got := s.Stats().ICMP.V6.PacketsSent.NeighborSolicit.Value(); got > 1 { t.Errorf("got NeighborSolicit = %d, want <= 1", got) } }) @@ -982,7 +989,7 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo pkt := header.ICMPv6(hdr.Prepend(icmpSize)) pkt.SetType(header.ICMPv6RouterAdvert) pkt.SetCode(0) - raPayload := pkt.NDPPayload() + raPayload := pkt.MessageBody() ra := header.NDPRouterAdvert(raPayload) // Populate the Router Lifetime. binary.BigEndian.PutUint16(raPayload[2:], rl) @@ -1004,11 +1011,11 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo payloadLength := hdr.UsedLength() iph := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) iph.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: header.NDPHopLimit, - SrcAddr: ip, - DstAddr: header.IPv6AllNodesMulticastAddress, + PayloadLength: uint16(payloadLength), + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: header.NDPHopLimit, + SrcAddr: ip, + DstAddr: header.IPv6AllNodesMulticastAddress, }) return stack.NewPacketBuffer(stack.PacketBufferOptions{ @@ -2162,8 +2169,8 @@ func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) { NDPConfigs: ipv6.NDPConfigurations{ AutoGenTempGlobalAddresses: true, }, - NDPDisp: &ndpDisp, - AutoGenIPv6LinkLocal: true, + NDPDisp: &ndpDisp, + AutoGenLinkLocal: true, })}, }) @@ -2843,9 +2850,7 @@ func addrForNewConnectionTo(t *testing.T, s *stack.Stack, addr tcpip.FullAddress t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) } defer ep.Close() - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err) - } + ep.SocketOptions().SetV6Only(true) if err := ep.Connect(addr); err != nil { t.Fatalf("ep.Connect(%+v): %s", addr, err) } @@ -2879,9 +2884,7 @@ func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullA t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) } defer ep.Close() - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err) - } + ep.SocketOptions().SetV6Only(true) if err := ep.Bind(addr); err != nil { t.Fatalf("ep.Bind(%+v): %s", addr, err) } @@ -3250,9 +3253,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err) } defer ep.Close() - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err) - } + ep.SocketOptions().SetV6Only(true) if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute { t.Errorf("got ep.Connect(%+v) = %s, want = %s", dstAddr, err, tcpip.ErrNoRoute) @@ -4044,9 +4045,9 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { ndpConfigs.AutoGenAddressConflictRetries = maxRetries s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal, - NDPConfigs: ndpConfigs, - NDPDisp: &ndpDisp, + AutoGenLinkLocal: addrType.autoGenLinkLocal, + NDPConfigs: ndpConfigs, + NDPDisp: &ndpDisp, OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: func(_ tcpip.NICID, nicName string) string { return nicName @@ -4179,9 +4180,9 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) { e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal, - NDPConfigs: addrType.ndpConfigs, - NDPDisp: &ndpDisp, + AutoGenLinkLocal: addrType.autoGenLinkLocal, + NDPConfigs: addrType.ndpConfigs, + NDPDisp: &ndpDisp, })}, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -4708,7 +4709,7 @@ func TestCleanupNDPState(t *testing.T) { } s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - AutoGenIPv6LinkLocal: true, + AutoGenLinkLocal: true, NDPConfigs: ipv6.NDPConfigurations{ HandleRAs: true, DiscoverDefaultRouters: true, @@ -5174,113 +5175,99 @@ func TestRouterSolicitation(t *testing.T) { }, } - // This Run will not return until the parallel tests finish. - // - // We need this because we need to do some teardown work after the - // parallel tests complete. - // - // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for - // more details. - t.Run("group", func(t *testing.T) { - for _, test := range tests { - test := test - - t.Run(test.name, func(t *testing.T) { - t.Parallel() + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + clock := faketime.NewManualClock() + e := channelLinkWithHeaderLength{ + Endpoint: channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr), + headerLength: test.linkHeaderLen, + } + e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired + waitForPkt := func(timeout time.Duration) { + t.Helper() - e := channelLinkWithHeaderLength{ - Endpoint: channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr), - headerLength: test.linkHeaderLen, + clock.Advance(timeout) + p, ok := e.Read() + if !ok { + t.Fatal("expected router solicitation packet") } - e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired - waitForPkt := func(timeout time.Duration) { - t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - p, ok := e.ReadContext(ctx) - if !ok { - t.Fatal("timed out waiting for packet") - return - } - if p.Proto != header.IPv6ProtocolNumber { - t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber) - } + if p.Proto != header.IPv6ProtocolNumber { + t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber) + } - // Make sure the right remote link address is used. - if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); p.Route.RemoteLinkAddress != want { - t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want) - } + // Make sure the right remote link address is used. + if got, want := p.Route.RemoteLinkAddress(), header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); got != want { + t.Errorf("got remote link address = %s, want = %s", got, want) + } - checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), - checker.SrcAddr(test.expectedSrcAddr), - checker.DstAddr(header.IPv6AllRoutersMulticastAddress), - checker.TTL(header.NDPHopLimit), - checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)), - ) + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), + checker.SrcAddr(test.expectedSrcAddr), + checker.DstAddr(header.IPv6AllRoutersMulticastAddress), + checker.TTL(header.NDPHopLimit), + checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)), + ) - if l, want := p.Pkt.AvailableHeaderBytes(), int(test.linkHeaderLen); l != want { - t.Errorf("got p.Pkt.AvailableHeaderBytes() = %d; want = %d", l, want) - } - } - waitForNothing := func(timeout time.Duration) { - t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - if _, ok := e.ReadContext(ctx); ok { - t.Fatal("unexpectedly got a packet") - } - } - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - MaxRtrSolicitations: test.maxRtrSolicit, - RtrSolicitationInterval: test.rtrSolicitInt, - MaxRtrSolicitationDelay: test.maxRtrSolicitDelay, - }, - })}, - }) - if err := s.CreateNIC(nicID, &e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + if l, want := p.Pkt.AvailableHeaderBytes(), int(test.linkHeaderLen); l != want { + t.Errorf("got p.Pkt.AvailableHeaderBytes() = %d; want = %d", l, want) } + } + waitForNothing := func(timeout time.Duration) { + t.Helper() - if addr := test.nicAddr; addr != "" { - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err) - } + clock.Advance(timeout) + if p, ok := e.Read(); ok { + t.Fatalf("unexpectedly got a packet = %#v", p) } + } + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + MaxRtrSolicitations: test.maxRtrSolicit, + RtrSolicitationInterval: test.rtrSolicitInt, + MaxRtrSolicitationDelay: test.maxRtrSolicitDelay, + }, + })}, + Clock: clock, + }) + if err := s.CreateNIC(nicID, &e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } - // Make sure each RS is sent at the right time. - remaining := test.maxRtrSolicit - if remaining > 0 { - waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncPositiveEventTimeout) - remaining-- + if addr := test.nicAddr; addr != "" { + if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil { + t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err) } + } - for ; remaining > 0; remaining-- { - if test.effectiveRtrSolicitInt > defaultAsyncPositiveEventTimeout { - waitForNothing(test.effectiveRtrSolicitInt - defaultAsyncNegativeEventTimeout) - waitForPkt(defaultAsyncPositiveEventTimeout) - } else { - waitForPkt(test.effectiveRtrSolicitInt + defaultAsyncPositiveEventTimeout) - } - } + // Make sure each RS is sent at the right time. + remaining := test.maxRtrSolicit + if remaining > 0 { + waitForPkt(test.effectiveMaxRtrSolicitDelay) + remaining-- + } - // Make sure no more RS. - if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay { - waitForNothing(test.effectiveRtrSolicitInt + defaultAsyncNegativeEventTimeout) + for ; remaining > 0; remaining-- { + if test.effectiveRtrSolicitInt > defaultAsyncPositiveEventTimeout { + waitForNothing(test.effectiveRtrSolicitInt - time.Nanosecond) + waitForPkt(time.Nanosecond) } else { - waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultAsyncNegativeEventTimeout) + waitForPkt(test.effectiveRtrSolicitInt) } + } - // Make sure the counter got properly - // incremented. - if got, want := s.Stats().ICMP.V6PacketsSent.RouterSolicit.Value(), uint64(test.maxRtrSolicit); got != want { - t.Fatalf("got sent RouterSolicit = %d, want = %d", got, want) - } - }) - } - }) + // Make sure no more RS. + if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay { + waitForNothing(test.effectiveRtrSolicitInt) + } else { + waitForNothing(test.effectiveMaxRtrSolicitDelay) + } + + if got, want := s.Stats().ICMP.V6.PacketsSent.RouterSolicit.Value(), uint64(test.maxRtrSolicit); got != want { + t.Fatalf("got sent RouterSolicit = %d, want = %d", got, want) + } + }) + } } func TestStopStartSolicitingRouters(t *testing.T) { diff --git a/pkg/tcpip/stack/neighbor_cache.go b/pkg/tcpip/stack/neighbor_cache.go index 177bf5516..317f6871d 100644 --- a/pkg/tcpip/stack/neighbor_cache.go +++ b/pkg/tcpip/stack/neighbor_cache.go @@ -24,9 +24,16 @@ import ( const neighborCacheSize = 512 // max entries per interface +// NeighborStats holds metrics for the neighbor table. +type NeighborStats struct { + // FailedEntryLookups counts the number of lookups performed on an entry in + // Failed state. + FailedEntryLookups *tcpip.StatCounter +} + // neighborCache maps IP addresses to link addresses. It uses the Least // Recently Used (LRU) eviction strategy to implement a bounded cache for -// dynmically acquired entries. It contains the state machine and configuration +// dynamically acquired entries. It contains the state machine and configuration // for running Neighbor Unreachability Detection (NUD). // // There are two types of entries in the neighbor cache: @@ -175,14 +182,15 @@ func (n *neighborCache) removeWaker(addr tcpip.Address, waker *sleep.Waker) { // entries returns all entries in the neighbor cache. func (n *neighborCache) entries() []NeighborEntry { - entries := make([]NeighborEntry, 0, len(n.cache)) n.mu.RLock() + defer n.mu.RUnlock() + + entries := make([]NeighborEntry, 0, len(n.cache)) for _, entry := range n.cache { entry.mu.RLock() entries = append(entries, entry.neigh) entry.mu.RUnlock() } - n.mu.RUnlock() return entries } @@ -226,6 +234,8 @@ func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAd } // removeEntryLocked removes the specified entry from the neighbor cache. +// +// Prerequisite: n.mu and entry.mu MUST be locked. func (n *neighborCache) removeEntryLocked(entry *neighborEntry) { if entry.neigh.State != Static { n.dynamic.lru.Remove(entry) diff --git a/pkg/tcpip/stack/neighbor_cache_test.go b/pkg/tcpip/stack/neighbor_cache_test.go index ed33418f3..732a299f7 100644 --- a/pkg/tcpip/stack/neighbor_cache_test.go +++ b/pkg/tcpip/stack/neighbor_cache_test.go @@ -80,17 +80,20 @@ func entryDiffOptsWithSort() []cmp.Option { func newTestNeighborCache(nudDisp NUDDispatcher, config NUDConfigurations, clock tcpip.Clock) *neighborCache { config.resetInvalidFields() rng := rand.New(rand.NewSource(time.Now().UnixNano())) - return &neighborCache{ + neigh := &neighborCache{ nic: &NIC{ stack: &Stack{ clock: clock, nudDisp: nudDisp, }, - id: 1, + id: 1, + stats: makeNICStats(), }, state: NewNUDState(config, rng), cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), } + neigh.nic.neigh = neigh + return neigh } // testEntryStore contains a set of IP to NeighborEntry mappings. diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go index 493e48031..32399b4f5 100644 --- a/pkg/tcpip/stack/neighbor_entry.go +++ b/pkg/tcpip/stack/neighbor_entry.go @@ -258,7 +258,7 @@ func (e *neighborEntry) setStateLocked(next NeighborState) { case Failed: e.notifyWakersLocked() - e.job = e.nic.stack.newJob(&e.mu, func() { + e.job = e.nic.stack.newJob(&doubleLock{first: &e.nic.neigh.mu, second: &e.mu}, func() { e.nic.neigh.removeEntryLocked(e) }) e.job.Schedule(config.UnreachableTime) @@ -347,9 +347,10 @@ func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) { e.setStateLocked(Delay) e.dispatchChangeEventLocked() - case Incomplete, Reachable, Delay, Probe, Static, Failed: + case Incomplete, Reachable, Delay, Probe, Static: // Do nothing - + case Failed: + e.nic.stats.Neighbor.FailedEntryLookups.Increment() default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) } @@ -511,3 +512,23 @@ func (e *neighborEntry) handleUpperLevelConfirmationLocked() { panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) } } + +// doubleLock combines two locks into one while maintaining lock ordering. +// +// TODO(gvisor.dev/issue/4796): Remove this once subsequent traffic to a Failed +// neighbor is allowed. +type doubleLock struct { + first, second sync.Locker +} + +// Lock locks both locks in order: first then second. +func (l *doubleLock) Lock() { + l.first.Lock() + l.second.Lock() +} + +// Unlock unlocks both locks in reverse order: second then first. +func (l *doubleLock) Unlock() { + l.second.Unlock() + l.first.Unlock() +} diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go index c2b763325..c497d3932 100644 --- a/pkg/tcpip/stack/neighbor_entry_test.go +++ b/pkg/tcpip/stack/neighbor_entry_test.go @@ -89,7 +89,7 @@ func eventDiffOptsWithSort() []cmp.Option { // | Stale | Reachable | Solicited confirmation w/o address | Notify wakers | Changed | // | Stale | Stale | Override confirmation | Update LinkAddr | Changed | // | Stale | Stale | Probe w/ different address | Update LinkAddr | Changed | -// | Stale | Delay | Packet sent | | Changed | +// | Stale | Delay | Packet queued | | Changed | // | Delay | Reachable | Upper-layer confirmation | | Changed | // | Delay | Reachable | Solicited override confirmation | Update LinkAddr | Changed | // | Delay | Reachable | Solicited confirmation w/o address | Notify wakers | Changed | @@ -101,6 +101,7 @@ func eventDiffOptsWithSort() []cmp.Option { // | Probe | Stale | Probe or confirmation w/ different address | | Changed | // | Probe | Probe | Retransmit timer expired | Send probe | Changed | // | Probe | Failed | Max probes sent without reply | Notify wakers | Removed | +// | Failed | Failed | Packet queued | | | // | Failed | | Unreachability timer expired | Delete entry | | type testEntryEventType uint8 @@ -228,6 +229,7 @@ func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *e clock: clock, nudDisp: &disp, }, + stats: makeNICStats(), } nic.networkEndpoints = map[tcpip.NetworkProtocolNumber]NetworkEndpoint{ header.IPv6ProtocolNumber: (&testIPv6Protocol{}).NewEndpoint(&nic, nil, nil, nil), @@ -3433,6 +3435,146 @@ func TestEntryProbeToFailed(t *testing.T) { nudDisp.mu.Unlock() } +func TestEntryFailedToFailed(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + c.MaxUnicastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + // Verify the cache contains the entry. + if _, ok := e.nic.neigh.cache[entryTestAddr1]; !ok { + t.Errorf("expected entry %q to exist in the neighbor cache", entryTestAddr1) + } + + // TODO(gvisor.dev/issue/4872): Use helper functions to start entry tests in + // their expected state. + e.mu.Lock() + e.handlePacketQueuedLocked(entryTestAddr2) + e.mu.Unlock() + + runImmediatelyScheduledJobs(clock) + { + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.probes = nil + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + } + + e.mu.Lock() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked(entryTestAddr2) + e.mu.Unlock() + + waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) + clock.Advance(waitFor) + { + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Entry: NeighborEntry{ + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Entry: NeighborEntry{ + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Entry: NeighborEntry{ + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Entry: NeighborEntry{ + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Entry: NeighborEntry{ + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + failedLookups := e.nic.stats.Neighbor.FailedEntryLookups + if got := failedLookups.Value(); got != 0 { + t.Errorf("got Neighbor.FailedEntryLookups = %d, want = 0", got) + } + + e.mu.Lock() + // Verify queuing a packet to the entry immediately fails. + e.handlePacketQueuedLocked(entryTestAddr2) + state := e.neigh.State + e.mu.Unlock() + if state != Failed { + t.Errorf("got e.neigh.State = %q, want = %q", state, Failed) + } + + if got := failedLookups.Value(); got != 1 { + t.Errorf("got Neighbor.FailedEntryLookups = %d, want = 1", got) + } +} + func TestEntryFailedGetsDeleted(t *testing.T) { c := DefaultNUDConfigurations() c.MaxMulticastProbes = 3 diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 3e6ceff28..5d037a27e 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -54,18 +54,20 @@ type NIC struct { sync.RWMutex spoofing bool promiscuous bool - // packetEPs is protected by mu, but the contained PacketEndpoint - // values are not. - packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint + // packetEPs is protected by mu, but the contained packetEndpointList are + // not. + packetEPs map[tcpip.NetworkProtocolNumber]*packetEndpointList } } -// NICStats includes transmitted and received stats. +// NICStats hold statistics for a NIC. type NICStats struct { Tx DirectionStats Rx DirectionStats DisabledRx DirectionStats + + Neighbor NeighborStats } func makeNICStats() NICStats { @@ -80,6 +82,39 @@ type DirectionStats struct { Bytes *tcpip.StatCounter } +type packetEndpointList struct { + mu sync.RWMutex + + // eps is protected by mu, but the contained PacketEndpoint values are not. + eps []PacketEndpoint +} + +func (p *packetEndpointList) add(ep PacketEndpoint) { + p.mu.Lock() + defer p.mu.Unlock() + p.eps = append(p.eps, ep) +} + +func (p *packetEndpointList) remove(ep PacketEndpoint) { + p.mu.Lock() + defer p.mu.Unlock() + for i, epOther := range p.eps { + if epOther == ep { + p.eps = append(p.eps[:i], p.eps[i+1:]...) + break + } + } +} + +// forEach calls fn with each endpoints in p while holding the read lock on p. +func (p *packetEndpointList) forEach(fn func(PacketEndpoint)) { + p.mu.RLock() + defer p.mu.RUnlock() + for _, ep := range p.eps { + fn(ep) + } +} + // newNIC returns a new NIC using the default NDP configurations from stack. func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICContext) *NIC { // TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For @@ -100,7 +135,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC stats: makeNICStats(), networkEndpoints: make(map[tcpip.NetworkProtocolNumber]NetworkEndpoint), } - nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint) + nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber]*packetEndpointList) // Check for Neighbor Unreachability Detection support. var nud NUDHandler @@ -123,11 +158,11 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC // Register supported packet and network endpoint protocols. for _, netProto := range header.Ethertypes { - nic.mu.packetEPs[netProto] = []PacketEndpoint{} + nic.mu.packetEPs[netProto] = new(packetEndpointList) } for _, netProto := range stack.networkProtocols { netNum := netProto.Number() - nic.mu.packetEPs[netNum] = nil + nic.mu.packetEPs[netNum] = new(packetEndpointList) nic.networkEndpoints[netNum] = netProto.NewEndpoint(nic, stack, nud, nic) } @@ -170,7 +205,7 @@ func (n *NIC) disable() { // // n MUST be locked. func (n *NIC) disableLocked() { - if !n.setEnabled(false) { + if !n.Enabled() { return } @@ -182,6 +217,10 @@ func (n *NIC) disableLocked() { for _, ep := range n.networkEndpoints { ep.Disable() } + + if !n.setEnabled(false) { + panic("should have only done work to disable the NIC if it was enabled") + } } // enable enables n. @@ -265,7 +304,7 @@ func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumb if ch, err := r.Resolve(nil); err != nil { if err == tcpip.ErrWouldBlock { r := r.Clone() - n.stack.linkResQueue.enqueue(ch, &r, protocol, pkt) + n.stack.linkResQueue.enqueue(ch, r, protocol, pkt) return nil } return err @@ -277,9 +316,9 @@ func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumb // WritePacketToRemote implements NetworkInterface. func (n *NIC) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { r := Route{ - NetProto: protocol, - RemoteLinkAddress: remoteLinkAddr, + NetProto: protocol, } + r.ResolveWith(remoteLinkAddr) return n.writePacket(&r, gso, protocol, pkt) } @@ -561,8 +600,7 @@ func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address return tcpip.ErrNotSupported } - _, err := gep.JoinGroup(addr) - return err + return gep.JoinGroup(addr) } // leaveGroup decrements the count for the given multicast address, and when it @@ -578,11 +616,7 @@ func (n *NIC) leaveGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Addres return tcpip.ErrNotSupported } - if _, err := gep.LeaveGroup(addr); err != nil { - return err - } - - return nil + return gep.LeaveGroup(addr) } // isInGroup returns true if n has joined the multicast group addr. @@ -637,15 +671,23 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp pkt.RXTransportChecksumValidated = n.LinkEndpoint.Capabilities()&CapabilityRXChecksumOffload != 0 // Are any packet type sockets listening for this network protocol? - packetEPs := n.mu.packetEPs[protocol] - // Add any other packet type sockets that may be listening for all protocols. - packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...) + protoEPs := n.mu.packetEPs[protocol] + // Other packet type sockets that are listening for all protocols. + anyEPs := n.mu.packetEPs[header.EthernetProtocolAll] n.mu.RUnlock() - for _, ep := range packetEPs { + + // Deliver to interested packet endpoints without holding NIC lock. + deliverPacketEPs := func(ep PacketEndpoint) { p := pkt.Clone() p.PktType = tcpip.PacketHost ep.HandlePacket(n.id, local, protocol, p) } + if protoEPs != nil { + protoEPs.forEach(deliverPacketEPs) + } + if anyEPs != nil { + anyEPs.forEach(deliverPacketEPs) + } // Parse headers. netProto := n.stack.NetworkProtocolInstance(protocol) @@ -686,16 +728,17 @@ func (n *NIC) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tc // We do not deliver to protocol specific packet endpoints as on Linux // only ETH_P_ALL endpoints get outbound packets. // Add any other packet sockets that maybe listening for all protocols. - packetEPs := n.mu.packetEPs[header.EthernetProtocolAll] + eps := n.mu.packetEPs[header.EthernetProtocolAll] n.mu.RUnlock() - for _, ep := range packetEPs { + + eps.forEach(func(ep PacketEndpoint) { p := pkt.Clone() p.PktType = tcpip.PacketOutgoing // Add the link layer header as outgoing packets are intercepted // before the link layer header is created. n.LinkEndpoint.AddHeader(local, remote, protocol, p) ep.HandlePacket(n.id, local, protocol, p) - } + }) } // DeliverTransportPacket delivers the packets to the appropriate transport @@ -848,7 +891,7 @@ func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep Pa if !ok { return tcpip.ErrNotSupported } - n.mu.packetEPs[netProto] = append(eps, ep) + eps.add(ep) return nil } @@ -861,13 +904,7 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep if !ok { return } - - for i, epOther := range eps { - if epOther == ep { - n.mu.packetEPs[netProto] = append(eps[:i], eps[i+1:]...) - return - } - } + eps.remove(ep) } // isValidForOutgoing returns true if the endpoint can be used to send out a diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 2cb13c6fa..b334e27c4 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -259,15 +259,6 @@ const ( PacketLoop ) -// NetOptions is an interface that allows us to pass network protocol specific -// options through the Stack layer code. -type NetOptions interface { - // SizeWithPadding returns the amount of memory that must be allocated to - // hold the options given that the value must be rounded up to the next - // multiple of 4 bytes. - SizeWithPadding() int -} - // NetworkHeaderParams are the header parameters given as input by the // transport endpoint to the network. type NetworkHeaderParams struct { @@ -279,10 +270,6 @@ type NetworkHeaderParams struct { // TOS refers to TypeOfService or TrafficClass field of the IP-header. TOS uint8 - - // Options is a set of options to add to a network header (or nil). - // It will be protocol specific opaque information from higher layers. - Options NetOptions } // GroupAddressableEndpoint is an endpoint that supports group addressing. @@ -291,14 +278,10 @@ type NetworkHeaderParams struct { // endpoints may associate themselves with the same identifier (group address). type GroupAddressableEndpoint interface { // JoinGroup joins the specified group. - // - // Returns true if the group was newly joined. - JoinGroup(group tcpip.Address) (bool, *tcpip.Error) + JoinGroup(group tcpip.Address) *tcpip.Error // LeaveGroup attempts to leave the specified group. - // - // Returns tcpip.ErrBadLocalAddress if the endpoint has not joined the group. - LeaveGroup(group tcpip.Address) (bool, *tcpip.Error) + LeaveGroup(group tcpip.Address) *tcpip.Error // IsInGroup returns true if the endpoint is a member of the specified group. IsInGroup(group tcpip.Address) bool @@ -739,10 +722,6 @@ type LinkEndpoint interface { // endpoint. Capabilities() LinkEndpointCapabilities - // WriteRawPacket writes a packet directly to the link. The packet - // should already have an ethernet header. It takes ownership of vv. - WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error - // Attach attaches the data link layer endpoint to the network-layer // dispatcher of the stack. // diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 53cb6694f..de5fe6ffe 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -18,19 +18,22 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // Route represents a route through the networking stack to a given destination. +// +// It is safe to call Route's methods from multiple goroutines. +// +// The exported fields are immutable. +// +// TODO(gvisor.dev/issue/4902): Unexpose immutable fields. type Route struct { // RemoteAddress is the final destination of the route. RemoteAddress tcpip.Address - // RemoteLinkAddress is the link-layer (MAC) address of the - // final destination of the route. - RemoteLinkAddress tcpip.LinkAddress - // LocalAddress is the local address where the route starts. LocalAddress tcpip.Address @@ -52,8 +55,16 @@ type Route struct { // address's assigned status without the NIC. localAddressNIC *NIC - // localAddressEndpoint is the local address this route is associated with. - localAddressEndpoint AssignableAddressEndpoint + mu struct { + sync.RWMutex + + // localAddressEndpoint is the local address this route is associated with. + localAddressEndpoint AssignableAddressEndpoint + + // remoteLinkAddress is the link-layer (MAC) address of the next hop in the + // route. + remoteLinkAddress tcpip.LinkAddress + } // outgoingNIC is the interface this route uses to write packets. outgoingNIC *NIC @@ -71,22 +82,24 @@ type Route struct { // ownership of the provided local address. // // Returns an empty route if validation fails. -func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndpoint AssignableAddressEndpoint, localAddressNIC, outgoingNIC *NIC, gateway, remoteAddr tcpip.Address, handleLocal, multicastLoop bool) Route { - addrWithPrefix := addressEndpoint.AddressWithPrefix() +func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndpoint AssignableAddressEndpoint, localAddressNIC, outgoingNIC *NIC, gateway, localAddr, remoteAddr tcpip.Address, handleLocal, multicastLoop bool) *Route { + if len(localAddr) == 0 { + localAddr = addressEndpoint.AddressWithPrefix().Address + } - if localAddressNIC != outgoingNIC && header.IsV6LinkLocalAddress(addrWithPrefix.Address) { + if localAddressNIC != outgoingNIC && header.IsV6LinkLocalAddress(localAddr) { addressEndpoint.DecRef() - return Route{} + return nil } // If no remote address is provided, use the local address. if len(remoteAddr) == 0 { - remoteAddr = addrWithPrefix.Address + remoteAddr = localAddr } r := makeRoute( netProto, - addrWithPrefix.Address, + localAddr, remoteAddr, outgoingNIC, localAddressNIC, @@ -99,8 +112,8 @@ func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndp // broadcast it. if len(gateway) > 0 { r.NextHop = gateway - } else if subnet := addrWithPrefix.Subnet(); subnet.IsBroadcast(remoteAddr) { - r.RemoteLinkAddress = header.EthernetBroadcastAddress + } else if subnet := addressEndpoint.Subnet(); subnet.IsBroadcast(remoteAddr) { + r.ResolveWith(header.EthernetBroadcastAddress) } return r @@ -108,11 +121,15 @@ func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndp // makeRoute initializes a new route. It takes ownership of the provided // AssignableAddressEndpoint. -func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool) Route { +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool) *Route { if localAddressNIC.stack != outgoingNIC.stack { panic(fmt.Sprintf("cannot create a route with NICs from different stacks")) } + if len(localAddr) == 0 { + localAddr = localAddressEndpoint.AddressWithPrefix().Address + } + loop := PacketOut // TODO(gvisor.dev/issue/4689): Loopback interface loops back packets at the @@ -133,18 +150,21 @@ func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip return makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop) } -func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, loop PacketLooping) Route { - r := Route{ - NetProto: netProto, - LocalAddress: localAddr, - LocalLinkAddress: outgoingNIC.LinkEndpoint.LinkAddress(), - RemoteAddress: remoteAddr, - localAddressNIC: localAddressNIC, - localAddressEndpoint: localAddressEndpoint, - outgoingNIC: outgoingNIC, - Loop: loop, +func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint, loop PacketLooping) *Route { + r := &Route{ + NetProto: netProto, + LocalAddress: localAddr, + LocalLinkAddress: outgoingNIC.LinkEndpoint.LinkAddress(), + RemoteAddress: remoteAddr, + localAddressNIC: localAddressNIC, + outgoingNIC: outgoingNIC, + Loop: loop, } + r.mu.Lock() + r.mu.localAddressEndpoint = localAddressEndpoint + r.mu.Unlock() + if r.outgoingNIC.LinkEndpoint.Capabilities()&CapabilityResolutionRequired != 0 { if linkRes, ok := r.outgoingNIC.stack.linkAddrResolvers[r.NetProto]; ok { r.linkRes = linkRes @@ -159,7 +179,7 @@ func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr // provided AssignableAddressEndpoint. // // A local route is a route to a destination that is local to the stack. -func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint) Route { +func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *NIC, localAddressEndpoint AssignableAddressEndpoint) *Route { loop := PacketLoop // TODO(gvisor.dev/issue/4689): Loopback interface loops back packets at the // link endpoint level. We can remove this check once loopback interfaces @@ -170,6 +190,14 @@ func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr return makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop) } +// RemoteLinkAddress returns the link-layer (MAC) address of the next hop in +// the route. +func (r *Route) RemoteLinkAddress() tcpip.LinkAddress { + r.mu.RLock() + defer r.mu.RUnlock() + return r.mu.remoteLinkAddress +} + // NICID returns the id of the NIC from which this route originates. func (r *Route) NICID() tcpip.NICID { return r.outgoingNIC.ID() @@ -231,7 +259,9 @@ func (r *Route) GSOMaxSize() uint32 { // ResolveWith immediately resolves a route with the specified remote link // address. func (r *Route) ResolveWith(addr tcpip.LinkAddress) { - r.RemoteLinkAddress = addr + r.mu.Lock() + defer r.mu.Unlock() + r.mu.remoteLinkAddress = addr } // Resolve attempts to resolve the link address if necessary. Returns ErrWouldBlock in @@ -244,7 +274,10 @@ func (r *Route) ResolveWith(addr tcpip.LinkAddress) { // // The NIC r uses must not be locked. func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) { - if !r.IsResolutionRequired() { + r.mu.Lock() + defer r.mu.Unlock() + + if !r.isResolutionRequiredRLocked() { // Nothing to do if there is no cache (which does the resolution on cache miss) or // link address is already known. return nil, nil @@ -254,7 +287,7 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) { if nextAddr == "" { // Local link address is already known. if r.RemoteAddress == r.LocalAddress { - r.RemoteLinkAddress = r.LocalLinkAddress + r.mu.remoteLinkAddress = r.LocalLinkAddress return nil, nil } nextAddr = r.RemoteAddress @@ -272,7 +305,7 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) { if err != nil { return ch, err } - r.RemoteLinkAddress = entry.LinkAddr + r.mu.remoteLinkAddress = entry.LinkAddr return nil, nil } @@ -280,7 +313,7 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) { if err != nil { return ch, err } - r.RemoteLinkAddress = linkAddr + r.mu.remoteLinkAddress = linkAddr return nil, nil } @@ -309,7 +342,13 @@ func (r *Route) local() bool { // // The NICs the route is associated with must not be locked. func (r *Route) IsResolutionRequired() bool { - if !r.isValidForOutgoing() || r.RemoteLinkAddress != "" || r.local() { + r.mu.RLock() + defer r.mu.RUnlock() + return r.isResolutionRequiredRLocked() +} + +func (r *Route) isResolutionRequiredRLocked() bool { + if !r.isValidForOutgoingRLocked() || r.mu.remoteLinkAddress != "" || r.local() { return false } @@ -317,11 +356,18 @@ func (r *Route) IsResolutionRequired() bool { } func (r *Route) isValidForOutgoing() bool { + r.mu.RLock() + defer r.mu.RUnlock() + return r.isValidForOutgoingRLocked() +} + +func (r *Route) isValidForOutgoingRLocked() bool { if !r.outgoingNIC.Enabled() { return false } - if !r.localAddressNIC.isValidForOutgoing(r.localAddressEndpoint) { + localAddressEndpoint := r.mu.localAddressEndpoint + if localAddressEndpoint == nil || !r.localAddressNIC.isValidForOutgoing(localAddressEndpoint) { return false } @@ -375,37 +421,44 @@ func (r *Route) MTU() uint32 { // Release frees all resources associated with the route. func (r *Route) Release() { - if r.localAddressEndpoint != nil { - r.localAddressEndpoint.DecRef() - r.localAddressEndpoint = nil + r.mu.Lock() + defer r.mu.Unlock() + + if r.mu.localAddressEndpoint != nil { + r.mu.localAddressEndpoint.DecRef() + r.mu.localAddressEndpoint = nil } } // Clone clones the route. -func (r *Route) Clone() Route { - if r.localAddressEndpoint != nil { - if !r.localAddressEndpoint.IncRef() { - panic(fmt.Sprintf("failed to increment reference count for local address endpoint = %s", r.LocalAddress)) - } +func (r *Route) Clone() *Route { + r.mu.RLock() + defer r.mu.RUnlock() + + newRoute := &Route{ + RemoteAddress: r.RemoteAddress, + LocalAddress: r.LocalAddress, + LocalLinkAddress: r.LocalLinkAddress, + NextHop: r.NextHop, + NetProto: r.NetProto, + Loop: r.Loop, + localAddressNIC: r.localAddressNIC, + outgoingNIC: r.outgoingNIC, + linkCache: r.linkCache, + linkRes: r.linkRes, } - return *r -} -// MakeLoopedRoute duplicates the given route with special handling for routes -// used for sending multicast or broadcast packets. In those cases the -// multicast/broadcast address is the remote address when sending out, but for -// incoming (looped) packets it becomes the local address. Similarly, the local -// interface address that was the local address going out becomes the remote -// address coming in. This is different to unicast routes where local and -// remote addresses remain the same as they identify location (local vs remote) -// not direction (source vs destination). -func (r *Route) MakeLoopedRoute() Route { - l := r.Clone() - if r.RemoteAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress) { - l.RemoteAddress, l.LocalAddress = l.LocalAddress, l.RemoteAddress - l.RemoteLinkAddress = l.LocalLinkAddress + newRoute.mu.Lock() + defer newRoute.mu.Unlock() + newRoute.mu.localAddressEndpoint = r.mu.localAddressEndpoint + if newRoute.mu.localAddressEndpoint != nil { + if !newRoute.mu.localAddressEndpoint.IncRef() { + panic(fmt.Sprintf("failed to increment reference count for local address endpoint = %s", newRoute.LocalAddress)) + } } - return l + newRoute.mu.remoteLinkAddress = r.mu.remoteLinkAddress + + return newRoute } // Stack returns the instance of the Stack that owns this route. @@ -418,7 +471,14 @@ func (r *Route) isV4Broadcast(addr tcpip.Address) bool { return true } - subnet := r.localAddressEndpoint.Subnet() + r.mu.RLock() + localAddressEndpoint := r.mu.localAddressEndpoint + r.mu.RUnlock() + if localAddressEndpoint == nil { + return false + } + + subnet := localAddressEndpoint.Subnet() return subnet.IsBroadcast(addr) } @@ -428,27 +488,3 @@ func (r *Route) IsOutboundBroadcast() bool { // Only IPv4 has a notion of broadcast. return r.isV4Broadcast(r.RemoteAddress) } - -// isInboundBroadcast returns true if the route is for an inbound broadcast -// packet. -func (r *Route) isInboundBroadcast() bool { - // Only IPv4 has a notion of broadcast. - return r.isV4Broadcast(r.LocalAddress) -} - -// ReverseRoute returns new route with given source and destination address. -func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route { - return Route{ - NetProto: r.NetProto, - LocalAddress: dst, - LocalLinkAddress: r.RemoteLinkAddress, - RemoteAddress: src, - RemoteLinkAddress: r.LocalLinkAddress, - Loop: r.Loop, - localAddressNIC: r.localAddressNIC, - localAddressEndpoint: r.localAddressEndpoint, - outgoingNIC: r.outgoingNIC, - linkCache: r.linkCache, - linkRes: r.linkRes, - } -} diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index e0025e0a9..026d330c4 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -171,6 +171,9 @@ type TCPSenderState struct { // Outstanding is the number of packets in flight. Outstanding int + // SackedOut is the number of packets which have been selectively acked. + SackedOut int + // SndWnd is the send window size in bytes. SndWnd seqnum.Size @@ -1118,6 +1121,16 @@ func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint) } +// AddAddressWithPrefix is the same as AddAddress, but allows you to specify +// the address prefix. +func (s *Stack) AddAddressWithPrefix(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.AddressWithPrefix) *tcpip.Error { + ap := tcpip.ProtocolAddress{ + Protocol: protocol, + AddressWithPrefix: addr, + } + return s.AddProtocolAddressWithOptions(id, ap, CanBePrimaryEndpoint) +} + // AddProtocolAddress adds a new network-layer protocol address to the // specified NIC. func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) *tcpip.Error { @@ -1208,10 +1221,10 @@ func (s *Stack) getAddressEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netP // from the specified NIC. // // Precondition: s.mu must be read locked. -func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (route Route, ok bool) { +func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route { localAddressEndpoint := localAddressNIC.getAddressOrCreateTempInner(netProto, localAddr, false /* createTemp */, NeverPrimaryEndpoint) if localAddressEndpoint == nil { - return Route{}, false + return nil } var outgoingNIC *NIC @@ -1235,12 +1248,12 @@ func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, re // route. if outgoingNIC == nil { localAddressEndpoint.DecRef() - return Route{}, false + return nil } r := makeLocalRoute( netProto, - localAddressEndpoint.AddressWithPrefix().Address, + localAddr, remoteAddr, outgoingNIC, localAddressNIC, @@ -1249,10 +1262,10 @@ func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, re if r.IsOutboundBroadcast() { r.Release() - return Route{}, false + return nil } - return r, true + return r } // findLocalRouteRLocked returns a local route. @@ -1261,26 +1274,26 @@ func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *NIC, localAddr, re // is, a local route is a route where packets never have to leave the stack. // // Precondition: s.mu must be read locked. -func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (route Route, ok bool) { +func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route { if len(localAddr) == 0 { localAddr = remoteAddr } if localAddressNICID == 0 { for _, localAddressNIC := range s.nics { - if r, ok := s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto); ok { - return r, true + if r := s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto); r != nil { + return r } } - return Route{}, false + return nil } if localAddressNIC, ok := s.nics[localAddressNICID]; ok { return s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto) } - return Route{}, false + return nil } // FindRoute creates a route to the given destination address, leaving through @@ -1294,7 +1307,7 @@ func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr, // If no local address is provided, the stack will select a local address. If no // remote address is provided, the stack wil use a remote address equal to the // local address. -func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) { +func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (*Route, *tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() @@ -1305,7 +1318,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n needRoute := !(isLocalBroadcast || isMulticast || isLinkLocal || isLoopback) if s.handleLocal && !isMulticast && !isLocalBroadcast { - if r, ok := s.findLocalRouteRLocked(id, localAddr, remoteAddr, netProto); ok { + if r := s.findLocalRouteRLocked(id, localAddr, remoteAddr, netProto); r != nil { return r, nil } } @@ -1317,7 +1330,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil { return makeRoute( netProto, - addressEndpoint.AddressWithPrefix().Address, + localAddr, remoteAddr, nic, /* outboundNIC */ nic, /* localAddressNIC*/ @@ -1329,9 +1342,9 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n } if isLoopback { - return Route{}, tcpip.ErrBadLocalAddress + return nil, tcpip.ErrBadLocalAddress } - return Route{}, tcpip.ErrNetworkUnreachable + return nil, tcpip.ErrNetworkUnreachable } canForward := s.Forwarding(netProto) && !header.IsV6LinkLocalAddress(localAddr) && !isLinkLocal @@ -1354,8 +1367,8 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n if needRoute { gateway = route.Gateway } - r := constructAndValidateRoute(netProto, addressEndpoint, nic /* outgoingNIC */, nic /* outgoingNIC */, gateway, remoteAddr, s.handleLocal, multicastLoop) - if r == (Route{}) { + r := constructAndValidateRoute(netProto, addressEndpoint, nic /* outgoingNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop) + if r == nil { panic(fmt.Sprintf("non-forwarding route validation failed with route table entry = %#v, id = %d, localAddr = %s, remoteAddr = %s", route, id, localAddr, remoteAddr)) } return r, nil @@ -1391,13 +1404,13 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n if id != 0 { if aNIC, ok := s.nics[id]; ok { if addressEndpoint := s.getAddressEP(aNIC, localAddr, remoteAddr, netProto); addressEndpoint != nil { - if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, remoteAddr, s.handleLocal, multicastLoop); r != (Route{}) { + if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop); r != nil { return r, nil } } } - return Route{}, tcpip.ErrNoRoute + return nil, tcpip.ErrNoRoute } if id == 0 { @@ -1409,7 +1422,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n continue } - if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, remoteAddr, s.handleLocal, multicastLoop); r != (Route{}) { + if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop); r != nil { return r, nil } } @@ -1417,12 +1430,12 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n } if needRoute { - return Route{}, tcpip.ErrNoRoute + return nil, tcpip.ErrNoRoute } if header.IsV6LoopbackAddress(remoteAddr) { - return Route{}, tcpip.ErrBadLocalAddress + return nil, tcpip.ErrBadLocalAddress } - return Route{}, tcpip.ErrNetworkUnreachable + return nil, tcpip.ErrNetworkUnreachable } // CheckNetworkProtocol checks if a given network protocol is enabled in the @@ -1810,49 +1823,20 @@ func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip nic.unregisterPacketEndpoint(netProto, ep) } -// WritePacket writes data directly to the specified NIC. It adds an ethernet -// header based on the arguments. -func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error { - s.mu.Lock() - nic, ok := s.nics[nicID] - s.mu.Unlock() - if !ok { - return tcpip.ErrUnknownDevice - } - - // Add our own fake ethernet header. - ethFields := header.EthernetFields{ - SrcAddr: nic.LinkEndpoint.LinkAddress(), - DstAddr: dst, - Type: netProto, - } - fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) - fakeHeader.Encode(ðFields) - vv := buffer.View(fakeHeader).ToVectorisedView() - vv.Append(payload) - - if err := nic.LinkEndpoint.WriteRawPacket(vv); err != nil { - return err - } - - return nil -} - -// WriteRawPacket writes data directly to the specified NIC without adding any -// headers. -func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error { +// WritePacketToRemote writes a payload on the specified NIC using the provided +// network protocol and remote link address. +func (s *Stack) WritePacketToRemote(nicID tcpip.NICID, remote tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error { s.mu.Lock() nic, ok := s.nics[nicID] s.mu.Unlock() if !ok { return tcpip.ErrUnknownDevice } - - if err := nic.LinkEndpoint.WriteRawPacket(payload); err != nil { - return err - } - - return nil + pkt := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: int(nic.MaxHeaderLength()), + Data: payload, + }) + return nic.WritePacketToRemote(remote, nil, netProto, pkt) } // NetworkProtocolInstance returns the protocol instance in the stack for the @@ -1912,7 +1896,6 @@ func (s *Stack) RemoveTCPProbe() { // JoinGroup joins the given multicast group on the given NIC. func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error { - // TODO: notify network of subscription via igmp protocol. s.mu.RLock() defer s.mu.RUnlock() @@ -2159,3 +2142,43 @@ func (s *Stack) networkProtocolNumbers() []tcpip.NetworkProtocolNumber { } return protos } + +func isSubnetBroadcastOnNIC(nic *NIC, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { + addressEndpoint := nic.getAddressOrCreateTempInner(protocol, addr, false /* createTemp */, NeverPrimaryEndpoint) + if addressEndpoint == nil { + return false + } + + subnet := addressEndpoint.Subnet() + addressEndpoint.DecRef() + return subnet.IsBroadcast(addr) +} + +// IsSubnetBroadcast returns true if the provided address is a subnet-local +// broadcast address on the specified NIC and protocol. +// +// Returns false if the NIC is unknown or if the protocol is unknown or does +// not support addressing. +// +// If the NIC is not specified, the stack will check all NICs. +func (s *Stack) IsSubnetBroadcast(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { + s.mu.RLock() + defer s.mu.RUnlock() + + if nicID != 0 { + nic, ok := s.nics[nicID] + if !ok { + return false + } + + return isSubnetBroadcastOnNIC(nic, protocol, addr) + } + + for _, nic := range s.nics { + if isSubnetBroadcastOnNIC(nic, protocol, addr) { + return true + } + } + + return false +} diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 61db3164b..457990945 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -27,7 +27,6 @@ import ( "time" "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -407,7 +406,7 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro return send(r, payload) } -func send(r stack.Route, payload buffer.View) *tcpip.Error { +func send(r *stack.Route, payload buffer.View) *tcpip.Error { return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), Data: payload.ToVectorisedView(), @@ -425,7 +424,7 @@ func testSendTo(t *testing.T, s *stack.Stack, addr tcpip.Address, ep *channel.En } } -func testSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer.View) { +func testSend(t *testing.T, r *stack.Route, ep *channel.Endpoint, payload buffer.View) { t.Helper() ep.Drain() if err := send(r, payload); err != nil { @@ -436,7 +435,7 @@ func testSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer. } } -func testFailingSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer.View, wantErr *tcpip.Error) { +func testFailingSend(t *testing.T, r *stack.Route, ep *channel.Endpoint, payload buffer.View, wantErr *tcpip.Error) { t.Helper() if gotErr := send(r, payload); gotErr != wantErr { t.Errorf("send failed: got = %s, want = %s ", gotErr, wantErr) @@ -1563,15 +1562,15 @@ func TestSpoofingNoAddress(t *testing.T) { // testSendTo(t, s, remoteAddr, ep, nil) } -func verifyRoute(gotRoute, wantRoute stack.Route) error { +func verifyRoute(gotRoute, wantRoute *stack.Route) error { if gotRoute.LocalAddress != wantRoute.LocalAddress { return fmt.Errorf("bad local address: got %s, want = %s", gotRoute.LocalAddress, wantRoute.LocalAddress) } if gotRoute.RemoteAddress != wantRoute.RemoteAddress { return fmt.Errorf("bad remote address: got %s, want = %s", gotRoute.RemoteAddress, wantRoute.RemoteAddress) } - if gotRoute.RemoteLinkAddress != wantRoute.RemoteLinkAddress { - return fmt.Errorf("bad remote link address: got %s, want = %s", gotRoute.RemoteLinkAddress, wantRoute.RemoteLinkAddress) + if got, want := gotRoute.RemoteLinkAddress(), wantRoute.RemoteLinkAddress(); got != want { + return fmt.Errorf("bad remote link address: got %s, want = %s", got, want) } if gotRoute.NextHop != wantRoute.NextHop { return fmt.Errorf("bad next-hop address: got %s, want = %s", gotRoute.NextHop, wantRoute.NextHop) @@ -1603,7 +1602,7 @@ func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) { if err != nil { t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) } - if err := verifyRoute(r, stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil { + if err := verifyRoute(r, &stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil { t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) } @@ -1657,7 +1656,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { if err != nil { t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err) } - if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil { + if err := verifyRoute(r, &stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil { t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err) } @@ -1667,7 +1666,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { if err != nil { t.Fatalf("FindRoute(0, \"\", %s, %d) failed: %s", header.IPv4Broadcast, fakeNetNumber, err) } - if err := verifyRoute(r, stack.Route{LocalAddress: nic2Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil { + if err := verifyRoute(r, &stack.Route{LocalAddress: nic2Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil { t.Errorf("FindRoute(0, \"\", %s, %d) returned unexpected Route: %s)", header.IPv4Broadcast, fakeNetNumber, err) } @@ -1683,7 +1682,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { if err != nil { t.Fatalf("FindRoute(0, \"\", %s, %d) failed: %s", header.IPv4Broadcast, fakeNetNumber, err) } - if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil { + if err := verifyRoute(r, &stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil { t.Errorf("FindRoute(0, \"\", %s, %d) returned unexpected Route: %s)", header.IPv4Broadcast, fakeNetNumber, err) } } @@ -2407,9 +2406,9 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) { } opts := stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - AutoGenIPv6LinkLocal: test.autoGen, - NDPDisp: &ndpDisp, - OpaqueIIDOpts: test.iidOpts, + AutoGenLinkLocal: test.autoGen, + NDPDisp: &ndpDisp, + OpaqueIIDOpts: test.iidOpts, })}, } @@ -2502,8 +2501,8 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) { t.Run(test.name, func(t *testing.T) { opts := stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - AutoGenIPv6LinkLocal: true, - OpaqueIIDOpts: test.opaqueIIDOpts, + AutoGenLinkLocal: true, + OpaqueIIDOpts: test.opaqueIIDOpts, })}, } @@ -2536,9 +2535,9 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) { ndpConfigs := ipv6.DefaultNDPConfigurations() opts := stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ndpConfigs, - AutoGenIPv6LinkLocal: true, - NDPDisp: &ndpDisp, + NDPConfigs: ndpConfigs, + AutoGenLinkLocal: true, + NDPDisp: &ndpDisp, })}, } @@ -3351,11 +3350,16 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { remNetSubnetBcast := remNetSubnet.Broadcast() tests := []struct { - name string - nicAddr tcpip.ProtocolAddress - routes []tcpip.Route - remoteAddr tcpip.Address - expectedRoute stack.Route + name string + nicAddr tcpip.ProtocolAddress + routes []tcpip.Route + remoteAddr tcpip.Address + expectedLocalAddress tcpip.Address + expectedRemoteAddress tcpip.Address + expectedRemoteLinkAddress tcpip.LinkAddress + expectedNextHop tcpip.Address + expectedNetProto tcpip.NetworkProtocolNumber + expectedLoop stack.PacketLooping }{ // Broadcast to a locally attached subnet populates the broadcast MAC. { @@ -3370,14 +3374,12 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { NIC: nicID1, }, }, - remoteAddr: ipv4SubnetBcast, - expectedRoute: stack.Route{ - LocalAddress: ipv4Addr.Address, - RemoteAddress: ipv4SubnetBcast, - RemoteLinkAddress: header.EthernetBroadcastAddress, - NetProto: header.IPv4ProtocolNumber, - Loop: stack.PacketOut | stack.PacketLoop, - }, + remoteAddr: ipv4SubnetBcast, + expectedLocalAddress: ipv4Addr.Address, + expectedRemoteAddress: ipv4SubnetBcast, + expectedRemoteLinkAddress: header.EthernetBroadcastAddress, + expectedNetProto: header.IPv4ProtocolNumber, + expectedLoop: stack.PacketOut | stack.PacketLoop, }, // Broadcast to a locally attached /31 subnet does not populate the // broadcast MAC. @@ -3393,13 +3395,11 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { NIC: nicID1, }, }, - remoteAddr: ipv4Subnet31Bcast, - expectedRoute: stack.Route{ - LocalAddress: ipv4AddrPrefix31.Address, - RemoteAddress: ipv4Subnet31Bcast, - NetProto: header.IPv4ProtocolNumber, - Loop: stack.PacketOut, - }, + remoteAddr: ipv4Subnet31Bcast, + expectedLocalAddress: ipv4AddrPrefix31.Address, + expectedRemoteAddress: ipv4Subnet31Bcast, + expectedNetProto: header.IPv4ProtocolNumber, + expectedLoop: stack.PacketOut, }, // Broadcast to a locally attached /32 subnet does not populate the // broadcast MAC. @@ -3415,13 +3415,11 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { NIC: nicID1, }, }, - remoteAddr: ipv4Subnet32Bcast, - expectedRoute: stack.Route{ - LocalAddress: ipv4AddrPrefix32.Address, - RemoteAddress: ipv4Subnet32Bcast, - NetProto: header.IPv4ProtocolNumber, - Loop: stack.PacketOut, - }, + remoteAddr: ipv4Subnet32Bcast, + expectedLocalAddress: ipv4AddrPrefix32.Address, + expectedRemoteAddress: ipv4Subnet32Bcast, + expectedNetProto: header.IPv4ProtocolNumber, + expectedLoop: stack.PacketOut, }, // IPv6 has no notion of a broadcast. { @@ -3436,13 +3434,11 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { NIC: nicID1, }, }, - remoteAddr: ipv6SubnetBcast, - expectedRoute: stack.Route{ - LocalAddress: ipv6Addr.Address, - RemoteAddress: ipv6SubnetBcast, - NetProto: header.IPv6ProtocolNumber, - Loop: stack.PacketOut, - }, + remoteAddr: ipv6SubnetBcast, + expectedLocalAddress: ipv6Addr.Address, + expectedRemoteAddress: ipv6SubnetBcast, + expectedNetProto: header.IPv6ProtocolNumber, + expectedLoop: stack.PacketOut, }, // Broadcast to a remote subnet in the route table is send to the next-hop // gateway. @@ -3459,14 +3455,12 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { NIC: nicID1, }, }, - remoteAddr: remNetSubnetBcast, - expectedRoute: stack.Route{ - LocalAddress: ipv4Addr.Address, - RemoteAddress: remNetSubnetBcast, - NextHop: ipv4Gateway, - NetProto: header.IPv4ProtocolNumber, - Loop: stack.PacketOut, - }, + remoteAddr: remNetSubnetBcast, + expectedLocalAddress: ipv4Addr.Address, + expectedRemoteAddress: remNetSubnetBcast, + expectedNextHop: ipv4Gateway, + expectedNetProto: header.IPv4ProtocolNumber, + expectedLoop: stack.PacketOut, }, // Broadcast to an unknown subnet follows the default route. Note that this // is essentially just routing an unknown destination IP, because w/o any @@ -3484,14 +3478,12 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { NIC: nicID1, }, }, - remoteAddr: remNetSubnetBcast, - expectedRoute: stack.Route{ - LocalAddress: ipv4Addr.Address, - RemoteAddress: remNetSubnetBcast, - NextHop: ipv4Gateway, - NetProto: header.IPv4ProtocolNumber, - Loop: stack.PacketOut, - }, + remoteAddr: remNetSubnetBcast, + expectedLocalAddress: ipv4Addr.Address, + expectedRemoteAddress: remNetSubnetBcast, + expectedNextHop: ipv4Gateway, + expectedNetProto: header.IPv4ProtocolNumber, + expectedLoop: stack.PacketOut, }, } @@ -3520,10 +3512,27 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { t.Fatalf("got unexpected address length = %d bytes", l) } - if r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, test.remoteAddr, netProto, false /* multicastLoop */); err != nil { + r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, test.remoteAddr, netProto, false /* multicastLoop */) + if err != nil { t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, test.remoteAddr, netProto, err) - } else if diff := cmp.Diff(r, test.expectedRoute, cmpopts.IgnoreUnexported(r)); diff != "" { - t.Errorf("route mismatch (-want +got):\n%s", diff) + } + if r.LocalAddress != test.expectedLocalAddress { + t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, test.expectedLocalAddress) + } + if r.RemoteAddress != test.expectedRemoteAddress { + t.Errorf("got r.RemoteAddress = %s, want = %s", r.RemoteAddress, test.expectedRemoteAddress) + } + if got := r.RemoteLinkAddress(); got != test.expectedRemoteLinkAddress { + t.Errorf("got r.RemoteLinkAddress() = %s, want = %s", got, test.expectedRemoteLinkAddress) + } + if r.NextHop != test.expectedNextHop { + t.Errorf("got r.NextHop = %s, want = %s", r.NextHop, test.expectedNextHop) + } + if r.NetProto != test.expectedNetProto { + t.Errorf("got r.NetProto = %d, want = %d", r.NetProto, test.expectedNetProto) + } + if r.Loop != test.expectedLoop { + t.Errorf("got r.Loop = %x, want = %x", r.Loop, test.expectedLoop) } }) } @@ -4091,10 +4100,12 @@ func TestFindRouteWithForwarding(t *testing.T) { s.SetRouteTable([]tcpip.Route{{Destination: test.netCfg.remoteAddr.WithPrefix().Subnet(), NIC: nicID2}}) r, err := s.FindRoute(test.addrNIC, test.localAddr, test.netCfg.remoteAddr, test.netCfg.proto, false /* multicastLoop */) + if r != nil { + defer r.Release() + } if err != test.findRouteErr { t.Fatalf("FindRoute(%d, %s, %s, %d, false) = %s, want = %s", test.addrNIC, test.localAddr, test.netCfg.remoteAddr, test.netCfg.proto, err, test.findRouteErr) } - defer r.Release() if test.findRouteErr != nil { return @@ -4152,3 +4163,63 @@ func TestFindRouteWithForwarding(t *testing.T) { }) } } + +func TestWritePacketToRemote(t *testing.T) { + const nicID = 1 + const MTU = 1280 + e := channel.New(1, MTU, linkAddr1) + s := stack.New(stack.Options{}) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("CreateNIC(%d) = %s", nicID, err) + } + tests := []struct { + name string + protocol tcpip.NetworkProtocolNumber + payload []byte + }{ + { + name: "SuccessIPv4", + protocol: header.IPv4ProtocolNumber, + payload: []byte{1, 2, 3, 4}, + }, + { + name: "SuccessIPv6", + protocol: header.IPv6ProtocolNumber, + payload: []byte{5, 6, 7, 8}, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + if err := s.WritePacketToRemote(nicID, linkAddr2, test.protocol, buffer.View(test.payload).ToVectorisedView()); err != nil { + t.Fatalf("s.WritePacketToRemote(_, _, _, _) = %s", err) + } + + pkt, ok := e.Read() + if got, want := ok, true; got != want { + t.Fatalf("e.Read() = %t, want %t", got, want) + } + if got, want := pkt.Proto, test.protocol; got != want { + t.Fatalf("pkt.Proto = %d, want %d", got, want) + } + if got, want := pkt.Route.RemoteLinkAddress(), linkAddr2; got != want { + t.Fatalf("pkt.Route.RemoteAddress = %s, want %s", got, want) + } + if diff := cmp.Diff(pkt.Pkt.Data.ToView(), buffer.View(test.payload)); diff != "" { + t.Errorf("pkt.Pkt.Data mismatch (-want +got):\n%s", diff) + } + }) + } + + t.Run("InvalidNICID", func(t *testing.T) { + if got, want := s.WritePacketToRemote(234, linkAddr2, header.IPv4ProtocolNumber, buffer.View([]byte{1}).ToVectorisedView()), tcpip.ErrUnknownDevice; got != want { + t.Fatalf("s.WritePacketToRemote(_, _, _, _) = %s, want = %s", got, want) + } + pkt, ok := e.Read() + if got, want := ok, false; got != want { + t.Fatalf("e.Read() = %t, %v; want %t", got, pkt, want) + } + }) +} diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go index 41a8e5ad0..a692af20b 100644 --- a/pkg/tcpip/stack/transport_demuxer_test.go +++ b/pkg/tcpip/stack/transport_demuxer_test.go @@ -141,11 +141,11 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI // Initialize the IP header. ip := header.IPv6(buf) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(header.UDPMinimumSize + len(payload)), - NextHeader: uint8(udp.ProtocolNumber), - HopLimit: 65, - SrcAddr: testSrcAddrV6, - DstAddr: testDstAddrV6, + PayloadLength: uint16(header.UDPMinimumSize + len(payload)), + TransportProtocol: udp.ProtocolNumber, + HopLimit: 65, + SrcAddr: testSrcAddrV6, + DstAddr: testDstAddrV6, }) // Initialize the UDP header. @@ -307,9 +307,7 @@ func TestBindToDeviceDistribution(t *testing.T) { }(ep) defer ep.Close() - if err := ep.SetSockOptBool(tcpip.ReusePortOption, endpoint.reuse); err != nil { - t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err) - } + ep.SocketOptions().SetReusePort(endpoint.reuse) bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice) if err := ep.SetSockOpt(&bindToDeviceOption); err != nil { t.Fatalf("SetSockOpt(&%T(%d)) on endpoint %d failed: %s", bindToDeviceOption, bindToDeviceOption, i, err) diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 5b9043d85..66eb562ba 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -38,14 +38,15 @@ const ( // use it. type fakeTransportEndpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler proto *fakeTransportProtocol peerAddr tcpip.Address - route stack.Route + route *stack.Route uniqueID uint64 // acceptQueue is non-nil iff bound. - acceptQueue []fakeTransportEndpoint + acceptQueue []*fakeTransportEndpoint // ops is used to set and get socket options. ops tcpip.SocketOptions @@ -64,8 +65,11 @@ func (*fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {} func (f *fakeTransportEndpoint) SocketOptions() *tcpip.SocketOptions { return &f.ops } + func newFakeTransportEndpoint(proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint { - return &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID} + ep := &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID} + ep.ops.InitHandler(ep) + return ep } func (f *fakeTransportEndpoint) Abort() { @@ -105,8 +109,8 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions return int64(len(v)), nil, nil } -func (*fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil +func (*fakeTransportEndpoint) Peek([][]byte) (int64, *tcpip.Error) { + return 0, nil } // SetSockOpt sets a socket option. Currently not supported. @@ -114,21 +118,11 @@ func (*fakeTransportEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Erro return tcpip.ErrInvalidEndpointState } -// SetSockOptBool sets a socket option. Currently not supported. -func (*fakeTransportEndpoint) SetSockOptBool(tcpip.SockOptBool, bool) *tcpip.Error { - return tcpip.ErrInvalidEndpointState -} - // SetSockOptInt sets a socket option. Currently not supported. func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOptInt, int) *tcpip.Error { return tcpip.ErrInvalidEndpointState } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (*fakeTransportEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - return false, tcpip.ErrUnknownProtocolOption -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { return -1, tcpip.ErrUnknownProtocolOption @@ -189,7 +183,7 @@ func (f *fakeTransportEndpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *wai if len(f.acceptQueue) == 0 { return nil, nil, nil } - a := &f.acceptQueue[0] + a := f.acceptQueue[0] f.acceptQueue = f.acceptQueue[1:] return a, nil, nil } @@ -206,7 +200,7 @@ func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error { ); err != nil { return err } - f.acceptQueue = []fakeTransportEndpoint{} + f.acceptQueue = []*fakeTransportEndpoint{} return nil } @@ -232,7 +226,7 @@ func (f *fakeTransportEndpoint) HandlePacket(id stack.TransportEndpointID, pkt * } route.ResolveWith(pkt.SourceLinkAddress()) - f.acceptQueue = append(f.acceptQueue, fakeTransportEndpoint{ + ep := &fakeTransportEndpoint{ TransportEndpointInfo: stack.TransportEndpointInfo{ ID: f.ID, NetProto: f.NetProto, @@ -240,7 +234,9 @@ func (f *fakeTransportEndpoint) HandlePacket(id stack.TransportEndpointID, pkt * proto: f.proto, peerAddr: route.RemoteAddress, route: route, - }) + } + ep.ops.InitHandler(ep) + f.acceptQueue = append(f.acceptQueue, ep) } func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, *stack.PacketBuffer) { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index f9e83dd1c..45fa62720 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -49,8 +49,9 @@ const ipv4AddressSize = 4 // Error represents an error in the netstack error space. Using a special type // ensures that errors outside of this space are not accidentally introduced. // -// Note: to support save / restore, it is important that all tcpip errors have -// distinct error messages. +// All errors must have unique msg strings. +// +// +stateify savable type Error struct { msg string @@ -247,6 +248,16 @@ func (a Address) WithPrefix() AddressWithPrefix { } } +// Unspecified returns true if the address is unspecified. +func (a Address) Unspecified() bool { + for _, b := range a { + if b != 0 { + return false + } + } + return true +} + // AddressMask is a bitmask for an address. type AddressMask string @@ -481,6 +492,14 @@ type ControlMessages struct { // PacketInfo holds interface and address data on an incoming packet. PacketInfo IPPacketInfo + + // HasOriginalDestinationAddress indicates whether OriginalDstAddress is + // set. + HasOriginalDstAddress bool + + // OriginalDestinationAddress holds the original destination address + // and port of the incoming packet. + OriginalDstAddress FullAddress } // PacketOwner is used to get UID and GID of the packet. @@ -535,7 +554,7 @@ type Endpoint interface { // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. - Peek([][]byte) (int64, ControlMessages, *Error) + Peek([][]byte) (int64, *Error) // Connect connects the endpoint to its peer. Specifying a NIC is // optional. @@ -593,10 +612,6 @@ type Endpoint interface { // SetSockOpt sets a socket option. SetSockOpt(opt SettableSocketOption) *Error - // SetSockOptBool sets a socket option, for simple cases where a value - // has the bool type. - SetSockOptBool(opt SockOptBool, v bool) *Error - // SetSockOptInt sets a socket option, for simple cases where a value // has the int type. SetSockOptInt(opt SockOptInt, v int) *Error @@ -604,10 +619,6 @@ type Endpoint interface { // GetSockOpt gets a socket option. GetSockOpt(opt GettableSocketOption) *Error - // GetSockOptBool gets a socket option for simple cases where a return - // value has the bool type. - GetSockOptBool(SockOptBool) (bool, *Error) - // GetSockOptInt gets a socket option for simple cases where a return // value has the int type. GetSockOptInt(SockOptInt) (int, *Error) @@ -694,79 +705,6 @@ type WriteOptions struct { Atomic bool } -// SockOptBool represents socket options which values have the bool type. -type SockOptBool int - -const ( - // CorkOption is used by SetSockOptBool/GetSockOptBool to specify if - // data should be held until segments are full by the TCP transport - // protocol. - CorkOption SockOptBool = iota - - // DelayOption is used by SetSockOptBool/GetSockOptBool to specify if - // data should be sent out immediately by the transport protocol. For - // TCP, it determines if the Nagle algorithm is on or off. - DelayOption - - // KeepaliveEnabledOption is used by SetSockOptBool/GetSockOptBool to - // specify whether TCP keepalive is enabled for this socket. - KeepaliveEnabledOption - - // MulticastLoopOption is used by SetSockOptBool/GetSockOptBool to - // specify whether multicast packets sent over a non-loopback interface - // will be looped back. - MulticastLoopOption - - // NoChecksumOption is used by SetSockOptBool/GetSockOptBool to specify - // whether UDP checksum is disabled for this socket. - NoChecksumOption - - // PasscredOption is used by SetSockOptBool/GetSockOptBool to specify - // whether SCM_CREDENTIALS socket control messages are enabled. - // - // Only supported on Unix sockets. - PasscredOption - - // QuickAckOption is stubbed out in SetSockOptBool/GetSockOptBool. - QuickAckOption - - // ReceiveTClassOption is used by SetSockOptBool/GetSockOptBool to - // specify if the IPV6_TCLASS ancillary message is passed with incoming - // packets. - ReceiveTClassOption - - // ReceiveTOSOption is used by SetSockOptBool/GetSockOptBool to specify - // if the TOS ancillary message is passed with incoming packets. - ReceiveTOSOption - - // ReceiveIPPacketInfoOption is used by SetSockOptBool/GetSockOptBool to - // specify if more inforamtion is provided with incoming packets such as - // interface index and address. - ReceiveIPPacketInfoOption - - // ReuseAddressOption is used by SetSockOptBool/GetSockOptBool to - // specify whether Bind() should allow reuse of local address. - ReuseAddressOption - - // ReusePortOption is used by SetSockOptBool/GetSockOptBool to permit - // multiple sockets to be bound to an identical socket address. - ReusePortOption - - // V6OnlyOption is used by SetSockOptBool/GetSockOptBool to specify - // whether an IPv6 socket is to be restricted to sending and receiving - // IPv6 packets only. - V6OnlyOption - - // IPHdrIncludedOption is used by SetSockOpt to indicate for a raw - // endpoint that all packets being written have an IP header and the - // endpoint should not attach an IP header. - IPHdrIncludedOption - - // AcceptConnOption is used by GetSockOptBool to indicate if the - // socket is a listening socket. - AcceptConnOption -) - // SockOptInt represents socket options which values have the int type. type SockOptInt int @@ -1158,14 +1096,6 @@ type RemoveMembershipOption MembershipOption func (*RemoveMembershipOption) isSettableSocketOption() {} -// OutOfBandInlineOption is used by SetSockOpt/GetSockOpt to specify whether -// TCP out-of-band data is delivered along with the normal in-band data. -type OutOfBandInlineOption int - -func (*OutOfBandInlineOption) isGettableSocketOption() {} - -func (*OutOfBandInlineOption) isSettableSocketOption() {} - // SocketDetachFilterOption is used by SetSockOpt to detach a previously attached // classic BPF filter on a given endpoint. type SocketDetachFilterOption int @@ -1215,10 +1145,6 @@ type LingerOption struct { Timeout time.Duration } -func (*LingerOption) isGettableSocketOption() {} - -func (*LingerOption) isSettableSocketOption() {} - // IPPacketInfo is the message structure for IP_PKTINFO. // // +stateify savable @@ -1389,6 +1315,18 @@ type ICMPv6PacketStats struct { // RedirectMsg is the total number of ICMPv6 redirect message packets // counted. RedirectMsg *StatCounter + + // MulticastListenerQuery is the total number of Multicast Listener Query + // messages counted. + MulticastListenerQuery *StatCounter + + // MulticastListenerReport is the total number of Multicast Listener Report + // messages counted. + MulticastListenerReport *StatCounter + + // MulticastListenerDone is the total number of Multicast Listener Done + // messages counted. + MulticastListenerDone *StatCounter } // ICMPv4SentPacketStats collects outbound ICMPv4-specific stats. @@ -1430,6 +1368,10 @@ type ICMPv6SentPacketStats struct { type ICMPv6ReceivedPacketStats struct { ICMPv6PacketStats + // Unrecognized is the total number of ICMPv6 packets received that the + // transport layer does not know how to parse. + Unrecognized *StatCounter + // Invalid is the total number of ICMPv6 packets received that the // transport layer could not parse. Invalid *StatCounter @@ -1439,25 +1381,90 @@ type ICMPv6ReceivedPacketStats struct { RouterOnlyPacketsDroppedByHost *StatCounter } -// ICMPStats collects ICMP-specific stats (both v4 and v6). -type ICMPStats struct { +// ICMPv4Stats collects ICMPv4-specific stats. +type ICMPv4Stats struct { // ICMPv4SentPacketStats contains counts of sent packets by ICMPv4 packet type // and a single count of packets which failed to write to the link // layer. - V4PacketsSent ICMPv4SentPacketStats + PacketsSent ICMPv4SentPacketStats // ICMPv4ReceivedPacketStats contains counts of received packets by ICMPv4 // packet type and a single count of invalid packets received. - V4PacketsReceived ICMPv4ReceivedPacketStats + PacketsReceived ICMPv4ReceivedPacketStats +} +// ICMPv6Stats collects ICMPv6-specific stats. +type ICMPv6Stats struct { // ICMPv6SentPacketStats contains counts of sent packets by ICMPv6 packet type // and a single count of packets which failed to write to the link // layer. - V6PacketsSent ICMPv6SentPacketStats + PacketsSent ICMPv6SentPacketStats // ICMPv6ReceivedPacketStats contains counts of received packets by ICMPv6 // packet type and a single count of invalid packets received. - V6PacketsReceived ICMPv6ReceivedPacketStats + PacketsReceived ICMPv6ReceivedPacketStats +} + +// ICMPStats collects ICMP-specific stats (both v4 and v6). +type ICMPStats struct { + // V4 contains the ICMPv4-specifics stats. + V4 ICMPv4Stats + + // V6 contains the ICMPv4-specifics stats. + V6 ICMPv6Stats +} + +// IGMPPacketStats enumerates counts for all IGMP packet types. +type IGMPPacketStats struct { + // MembershipQuery is the total number of Membership Query messages counted. + MembershipQuery *StatCounter + + // V1MembershipReport is the total number of Version 1 Membership Report + // messages counted. + V1MembershipReport *StatCounter + + // V2MembershipReport is the total number of Version 2 Membership Report + // messages counted. + V2MembershipReport *StatCounter + + // LeaveGroup is the total number of Leave Group messages counted. + LeaveGroup *StatCounter +} + +// IGMPSentPacketStats collects outbound IGMP-specific stats. +type IGMPSentPacketStats struct { + IGMPPacketStats + + // Dropped is the total number of IGMP packets dropped. + Dropped *StatCounter +} + +// IGMPReceivedPacketStats collects inbound IGMP-specific stats. +type IGMPReceivedPacketStats struct { + IGMPPacketStats + + // Invalid is the total number of IGMP packets received that IGMP could not + // parse. + Invalid *StatCounter + + // ChecksumErrors is the total number of IGMP packets dropped due to bad + // checksums. + ChecksumErrors *StatCounter + + // Unrecognized is the total number of unrecognized messages counted, these + // are silently ignored for forward-compatibilty. + Unrecognized *StatCounter +} + +// IGMPStats colelcts IGMP-specific stats. +type IGMPStats struct { + // IGMPSentPacketStats contains counts of sent packets by IGMP packet type + // and a single count of invalid packets received. + PacketsSent IGMPSentPacketStats + + // IGMPReceivedPacketStats contains counts of received packets by IGMP packet + // type and a single count of invalid packets received. + PacketsReceived IGMPReceivedPacketStats } // IPStats collects IP-specific stats (both v4 and v6). @@ -1665,6 +1672,9 @@ type Stats struct { // ICMP breaks out ICMP-specific stats (both v4 and v6). ICMP ICMPStats + // IGMP breaks out IGMP-specific stats. + IGMP IGMPStats + // IP breaks out IP-specific stats (both v4 and v6). IP IPStats diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go index 1c8e2bc34..c461da137 100644 --- a/pkg/tcpip/tcpip_test.go +++ b/pkg/tcpip/tcpip_test.go @@ -226,3 +226,47 @@ func TestAddressWithPrefixSubnet(t *testing.T) { } } } + +func TestAddressUnspecified(t *testing.T) { + tests := []struct { + addr Address + unspecified bool + }{ + { + addr: "", + unspecified: true, + }, + { + addr: "\x00", + unspecified: true, + }, + { + addr: "\x01", + unspecified: false, + }, + { + addr: "\x00\x00", + unspecified: true, + }, + { + addr: "\x01\x00", + unspecified: false, + }, + { + addr: "\x00\x01", + unspecified: false, + }, + { + addr: "\x01\x01", + unspecified: false, + }, + } + + for _, test := range tests { + t.Run(fmt.Sprintf("addr=%s", test.addr), func(t *testing.T) { + if got := test.addr.Unspecified(); got != test.unspecified { + t.Fatalf("got addr.Unspecified() = %t, want = %t", got, test.unspecified) + } + }) + } +} diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD index 9b0f3b675..800025fb9 100644 --- a/pkg/tcpip/tests/integration/BUILD +++ b/pkg/tcpip/tests/integration/BUILD @@ -25,6 +25,7 @@ go_test( "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/waiter", "@com_github_google_go_cmp//cmp:go_default_library", diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go index 421da1add..baaa741cd 100644 --- a/pkg/tcpip/tests/integration/loopback_test.go +++ b/pkg/tcpip/tests/integration/loopback_test.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) @@ -70,8 +71,8 @@ func TestInitialLoopbackAddresses(t *testing.T) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPDisp: &ndpDispatcher{}, - AutoGenIPv6LinkLocal: true, + NDPDisp: &ndpDispatcher{}, + AutoGenLinkLocal: true, OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{ NICNameFromID: func(nicID tcpip.NICID, nicName string) string { t.Fatalf("should not attempt to get name for NIC with ID = %d; nicName = %s", nicID, nicName) @@ -93,9 +94,10 @@ func TestInitialLoopbackAddresses(t *testing.T) { } } -// TestLoopbackAcceptAllInSubnet tests that a loopback interface considers -// itself bound to all addresses in the subnet of an assigned address. -func TestLoopbackAcceptAllInSubnet(t *testing.T) { +// TestLoopbackAcceptAllInSubnetUDP tests that a loopback interface considers +// itself bound to all addresses in the subnet of an assigned address and UDP +// traffic is sent/received correctly. +func TestLoopbackAcceptAllInSubnetUDP(t *testing.T) { const ( nicID = 1 localPort = 80 @@ -107,7 +109,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) { Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: ipv4Addr, } - ipv4Bytes := []byte(ipv4Addr.Address) + ipv4Bytes := []byte(ipv4ProtocolAddress.AddressWithPrefix.Address) ipv4Bytes[len(ipv4Bytes)-1]++ otherIPv4Address := tcpip.Address(ipv4Bytes) @@ -129,7 +131,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) { { name: "IPv4 bind to wildcard and send to assigned address", addAddress: ipv4ProtocolAddress, - dstAddr: ipv4Addr.Address, + dstAddr: ipv4ProtocolAddress.AddressWithPrefix.Address, expectRx: true, }, { @@ -148,7 +150,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) { name: "IPv4 bind to other subnet-local address and send to assigned address", addAddress: ipv4ProtocolAddress, bindAddr: otherIPv4Address, - dstAddr: ipv4Addr.Address, + dstAddr: ipv4ProtocolAddress.AddressWithPrefix.Address, expectRx: false, }, { @@ -161,7 +163,7 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) { { name: "IPv4 bind to assigned address and send to other subnet-local address", addAddress: ipv4ProtocolAddress, - bindAddr: ipv4Addr.Address, + bindAddr: ipv4ProtocolAddress.AddressWithPrefix.Address, dstAddr: otherIPv4Address, expectRx: false, }, @@ -236,13 +238,17 @@ func TestLoopbackAcceptAllInSubnet(t *testing.T) { t.Fatalf("got sep.Write(_, _) = (%d, _, nil), want = (%d, _, nil)", n, want) } - if gotPayload, _, err := rep.Read(nil); test.expectRx { + var addr tcpip.FullAddress + if gotPayload, _, err := rep.Read(&addr); test.expectRx { if err != nil { - t.Fatalf("reep.Read(nil): %s", err) + t.Fatalf("reep.Read(_): %s", err) } if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" { t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff) } + if addr.Addr != test.addAddress.AddressWithPrefix.Address { + t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, test.addAddress.AddressWithPrefix.Address) + } } else { if err != tcpip.ErrWouldBlock { t.Fatalf("got rep.Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock) @@ -312,3 +318,168 @@ func TestLoopbackSubnetLifetimeBoundToAddr(t *testing.T) { t.Fatalf("got r.WritePacket(nil, %#v, _) = %s, want = %s", params, err, tcpip.ErrInvalidEndpointState) } } + +// TestLoopbackAcceptAllInSubnetTCP tests that a loopback interface considers +// itself bound to all addresses in the subnet of an assigned address and TCP +// traffic is sent/received correctly. +func TestLoopbackAcceptAllInSubnetTCP(t *testing.T) { + const ( + nicID = 1 + localPort = 80 + ) + + ipv4ProtocolAddress := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + } + ipv4ProtocolAddress.AddressWithPrefix.PrefixLen = 8 + ipv4Bytes := []byte(ipv4ProtocolAddress.AddressWithPrefix.Address) + ipv4Bytes[len(ipv4Bytes)-1]++ + otherIPv4Address := tcpip.Address(ipv4Bytes) + + ipv6ProtocolAddress := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: ipv6Addr, + } + ipv6Bytes := []byte(ipv6Addr.Address) + ipv6Bytes[len(ipv6Bytes)-1]++ + otherIPv6Address := tcpip.Address(ipv6Bytes) + + tests := []struct { + name string + addAddress tcpip.ProtocolAddress + bindAddr tcpip.Address + dstAddr tcpip.Address + expectAccept bool + }{ + { + name: "IPv4 bind to wildcard and send to assigned address", + addAddress: ipv4ProtocolAddress, + dstAddr: ipv4ProtocolAddress.AddressWithPrefix.Address, + expectAccept: true, + }, + { + name: "IPv4 bind to wildcard and send to other subnet-local address", + addAddress: ipv4ProtocolAddress, + dstAddr: otherIPv4Address, + expectAccept: true, + }, + { + name: "IPv4 bind to wildcard send to other address", + addAddress: ipv4ProtocolAddress, + dstAddr: remoteIPv4Addr, + expectAccept: false, + }, + { + name: "IPv4 bind to other subnet-local address and send to assigned address", + addAddress: ipv4ProtocolAddress, + bindAddr: otherIPv4Address, + dstAddr: ipv4ProtocolAddress.AddressWithPrefix.Address, + expectAccept: false, + }, + { + name: "IPv4 bind and send to other subnet-local address", + addAddress: ipv4ProtocolAddress, + bindAddr: otherIPv4Address, + dstAddr: otherIPv4Address, + expectAccept: true, + }, + { + name: "IPv4 bind to assigned address and send to other subnet-local address", + addAddress: ipv4ProtocolAddress, + bindAddr: ipv4ProtocolAddress.AddressWithPrefix.Address, + dstAddr: otherIPv4Address, + expectAccept: false, + }, + + { + name: "IPv6 bind and send to assigned address", + addAddress: ipv6ProtocolAddress, + bindAddr: ipv6Addr.Address, + dstAddr: ipv6Addr.Address, + expectAccept: true, + }, + { + name: "IPv6 bind to wildcard and send to other subnet-local address", + addAddress: ipv6ProtocolAddress, + dstAddr: otherIPv6Address, + expectAccept: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, + }) + if err := s.CreateNIC(nicID, loopback.New()); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + if err := s.AddProtocolAddress(nicID, test.addAddress); err != nil { + t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, test.addAddress, err) + } + s.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: header.IPv4EmptySubnet, + NIC: nicID, + }, + tcpip.Route{ + Destination: header.IPv6EmptySubnet, + NIC: nicID, + }, + }) + + var wq waiter.Queue + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + listeningEndpoint, err := s.NewEndpoint(tcp.ProtocolNumber, test.addAddress.Protocol, &wq) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err) + } + defer listeningEndpoint.Close() + + bindAddr := tcpip.FullAddress{Addr: test.bindAddr, Port: localPort} + if err := listeningEndpoint.Bind(bindAddr); err != nil { + t.Fatalf("listeningEndpoint.Bind(%#v): %s", bindAddr, err) + } + + if err := listeningEndpoint.Listen(1); err != nil { + t.Fatalf("listeningEndpoint.Listen(1): %s", err) + } + + connectingEndpoint, err := s.NewEndpoint(tcp.ProtocolNumber, test.addAddress.Protocol, &wq) + if err != nil { + t.Fatalf("s.NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err) + } + defer connectingEndpoint.Close() + + connectAddr := tcpip.FullAddress{ + Addr: test.dstAddr, + Port: localPort, + } + if err := connectingEndpoint.Connect(connectAddr); err != tcpip.ErrConnectStarted { + t.Fatalf("connectingEndpoint.Connect(%#v): %s", connectAddr, err) + } + + if !test.expectAccept { + if _, _, err := listeningEndpoint.Accept(nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got listeningEndpoint.Accept(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) + } + return + } + + // Wait for the listening endpoint to be "readable". That is, wait for a + // new connection. + <-ch + var addr tcpip.FullAddress + if _, _, err := listeningEndpoint.Accept(&addr); err != nil { + t.Fatalf("listeningEndpoint.Accept(nil): %s", err) + } + if addr.Addr != test.addAddress.AddressWithPrefix.Address { + t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, test.addAddress.AddressWithPrefix.Address) + } + }) + } +} diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go index 9d30329f5..2e59f6a42 100644 --- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go +++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go @@ -96,11 +96,11 @@ func TestPingMulticastBroadcast(t *testing.T) { pkt.SetChecksum(header.ICMPv6Checksum(pkt, remoteIPv6Addr, dst, buffer.VectorisedView{})) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: header.ICMPv6MinimumSize, - NextHeader: uint8(icmp.ProtocolNumber6), - HopLimit: ttl, - SrcAddr: remoteIPv6Addr, - DstAddr: dst, + PayloadLength: header.ICMPv6MinimumSize, + TransportProtocol: icmp.ProtocolNumber6, + HopLimit: ttl, + SrcAddr: remoteIPv6Addr, + DstAddr: dst, }) e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ @@ -272,11 +272,11 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) { ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLen), - NextHeader: uint8(udp.ProtocolNumber), - HopLimit: ttl, - SrcAddr: remoteIPv6Addr, - DstAddr: dst, + PayloadLength: uint16(payloadLen), + TransportProtocol: udp.ProtocolNumber, + HopLimit: ttl, + SrcAddr: remoteIPv6Addr, + DstAddr: dst, }) e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ @@ -510,10 +510,7 @@ func TestReuseAddrAndBroadcast(t *testing.T) { } defer ep.Close() - if err := ep.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("eps[%d].SetSockOptBool(tcpip.ReuseAddressOption, true): %s", len(eps), err) - } - + ep.SocketOptions().SetReuseAddress(true) ep.SocketOptions().SetBroadcast(true) bindAddr := tcpip.FullAddress{Port: localPort} diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 440cb0352..74fe19e98 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -49,6 +49,7 @@ const ( // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. @@ -71,11 +72,9 @@ type endpoint struct { // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags state endpointState - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` ttl uint8 stats tcpip.TransportEndpointStats `state:"nosave"` - // linger is used for SO_LINGER socket option. - linger tcpip.LingerOption // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner @@ -85,7 +84,7 @@ type endpoint struct { } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return &endpoint{ + ep := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, @@ -96,7 +95,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt sndBufSize: 32 * 1024, state: stateInitial, uniqueID: s.UniqueID(), - }, nil + } + ep.ops.InitHandler(ep) + return ep, nil } // UniqueID implements stack.TransportEndpoint.UniqueID. @@ -129,7 +130,10 @@ func (e *endpoint) Close() { } e.rcvMu.Unlock() - e.route.Release() + if e.route != nil { + e.route.Release() + e.route = nil + } // Update the state. e.state = stateClosed @@ -142,6 +146,7 @@ func (e *endpoint) Close() { // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (e *endpoint) ModerateRecvBuf(copied int) {} +// SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } @@ -267,26 +272,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } } - var route *stack.Route - if to == nil { - route = &e.route - - if route.IsResolutionRequired() { - // Promote lock to exclusive if using a shared route, - // given that it may need to change in Route.Resolve() - // call below. - e.mu.RUnlock() - defer e.mu.RLock() - - e.mu.Lock() - defer e.mu.Unlock() - - // Recheck state after lock was re-acquired. - if e.state != stateConnected { - return 0, nil, tcpip.ErrInvalidEndpointState - } - } - } else { + route := e.route + if to != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. nicID := to.NIC @@ -310,7 +297,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } defer r.Release() - route = &r + route = r } if route.IsResolutionRequired() { @@ -343,26 +330,12 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } // Peek only returns data from a single datagram, so do nothing here. -func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil +func (e *endpoint) Peek([][]byte) (int64, *tcpip.Error) { + return 0, nil } // SetSockOpt sets a socket option. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch v := opt.(type) { - case *tcpip.SocketDetachFilterOption: - return nil - - case *tcpip.LingerOption: - e.mu.Lock() - e.linger = *v - e.mu.Unlock() - } - return nil -} - -// SetSockOptBool sets a socket option. Currently not supported. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { return nil } @@ -378,17 +351,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { return nil } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption: - return false, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -426,16 +388,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { - switch o := opt.(type) { - case *tcpip.LingerOption: - e.mu.Lock() - *o = e.linger - e.mu.Unlock() - return nil - - default: - return tcpip.ErrUnknownProtocolOption - } + return tcpip.ErrUnknownProtocolOption } func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error { @@ -857,6 +810,7 @@ func (*endpoint) LastError() *tcpip.Error { return nil } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 3bff3755a..9faab4b9e 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -60,6 +60,8 @@ type packet struct { // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler + // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` @@ -83,8 +85,6 @@ type endpoint struct { stats tcpip.TransportEndpointStats `state:"nosave"` bound bool boundNIC tcpip.NICID - // linger is used for SO_LINGER socket option. - linger tcpip.LingerOption // lastErrorMu protects lastError. lastErrorMu sync.Mutex `state:"nosave"` @@ -107,6 +107,7 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, } + ep.ops.InitHandler(ep) // Override with stack defaults. var ss stack.SendBufferSizeOption @@ -203,8 +204,8 @@ func (*endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-cha } // Peek implements tcpip.Endpoint.Peek. -func (*endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil +func (*endpoint) Peek([][]byte) (int64, *tcpip.Error) { + return 0, nil } // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be @@ -303,26 +304,15 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // used with SetSockOpt, and this function always returns // tcpip.ErrNotSupported. func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch v := opt.(type) { + switch opt.(type) { case *tcpip.SocketDetachFilterOption: return nil - case *tcpip.LingerOption: - ep.mu.Lock() - ep.linger = *v - ep.mu.Unlock() - return nil - default: return tcpip.ErrUnknownProtocolOption } } -// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. -func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { @@ -378,26 +368,7 @@ func (ep *endpoint) LastError() *tcpip.Error { // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { - switch o := opt.(type) { - case *tcpip.LingerOption: - ep.mu.Lock() - *o = ep.linger - ep.mu.Unlock() - return nil - - default: - return tcpip.ErrNotSupported - } -} - -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (*endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.AcceptConnOption: - return false, nil - default: - return false, tcpip.ErrNotSupported - } + return tcpip.ErrNotSupported } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. @@ -551,8 +522,10 @@ func (ep *endpoint) Stats() tcpip.EndpointStats { return &ep.stats } +// SetOwner implements tcpip.Endpoint.SetOwner. func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {} +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { return &ep.ops } diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 4ae1f92ab..87c60bdab 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -58,12 +58,13 @@ type rawPacket struct { // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler + // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue associated bool - hdrIncluded bool // The following fields are used to manage the receive queue and are // protected by rcvMu. @@ -82,10 +83,8 @@ type endpoint struct { bound bool // route is the route to a remote network endpoint. It is set via // Connect(), and is valid only when conneted is true. - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` stats tcpip.TransportEndpointStats `state:"nosave"` - // linger is used for SO_LINGER socket option. - linger tcpip.LingerOption // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner @@ -114,8 +113,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt rcvBufSizeMax: 32 * 1024, sndBufSizeMax: 32 * 1024, associated: associated, - hdrIncluded: !associated, } + e.ops.InitHandler(e) + e.ops.SetHeaderIncluded(!associated) // Override with stack defaults. var ss stack.SendBufferSizeOption @@ -170,9 +170,11 @@ func (e *endpoint) Close() { e.rcvList.Remove(e.rcvList.Front()) } - if e.connected { + e.connected = false + + if e.route != nil { e.route.Release() - e.connected = false + e.route = nil } e.closed = true @@ -223,6 +225,13 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c return 0, nil, tcpip.ErrInvalidOptionValue } + if opts.To != nil { + // Raw sockets do not support sending to a IPv4 address on a IPv6 endpoint. + if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(opts.To.Addr) != header.IPv6AddressSize { + return 0, nil, tcpip.ErrInvalidOptionValue + } + } + n, ch, err := e.write(p, opts) switch err { case nil: @@ -266,7 +275,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // If this is an unassociated socket and callee provided a nonzero // destination address, route using that address. - if e.hdrIncluded { + if e.ops.GetHeaderIncluded() { ip := header.IPv4(payloadBytes) if !ip.IsValid(len(payloadBytes)) { e.mu.RUnlock() @@ -296,7 +305,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } if e.route.IsResolutionRequired() { - savedRoute := &e.route + savedRoute := e.route // Promote lock to exclusive if using a shared route, // given that it may need to change in finishWrite. e.mu.RUnlock() @@ -304,7 +313,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // Make sure that the route didn't change during the // time we didn't hold the lock. - if !e.connected || savedRoute != &e.route { + if !e.connected || savedRoute != e.route { e.mu.Unlock() return 0, nil, tcpip.ErrInvalidEndpointState } @@ -314,7 +323,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c return n, ch, err } - n, ch, err := e.finishWrite(payloadBytes, &e.route) + n, ch, err := e.finishWrite(payloadBytes, e.route) e.mu.RUnlock() return n, ch, err } @@ -335,7 +344,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c return 0, nil, err } - n, ch, err := e.finishWrite(payloadBytes, &route) + n, ch, err := e.finishWrite(payloadBytes, route) route.Release() e.mu.RUnlock() return n, ch, err @@ -356,7 +365,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, } } - if e.hdrIncluded { + if e.ops.GetHeaderIncluded() { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.View(payloadBytes).ToVectorisedView(), }) @@ -382,8 +391,8 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, } // Peek implements tcpip.Endpoint.Peek. -func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil +func (e *endpoint) Peek([][]byte) (int64, *tcpip.Error) { + return 0, nil } // Disconnect implements tcpip.Endpoint.Disconnect. @@ -393,6 +402,11 @@ func (*endpoint) Disconnect() *tcpip.Error { // Connect implements tcpip.Endpoint.Connect. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + // Raw sockets do not support connecting to a IPv4 address on a IPv6 endpoint. + if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(addr.Addr) != header.IPv6AddressSize { + return tcpip.ErrAddressFamilyNotSupported + } + e.mu.Lock() defer e.mu.Unlock() @@ -516,33 +530,15 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch v := opt.(type) { + switch opt.(type) { case *tcpip.SocketDetachFilterOption: return nil - case *tcpip.LingerOption: - e.mu.Lock() - e.linger = *v - e.mu.Unlock() - return nil - default: return tcpip.ErrUnknownProtocolOption } } -// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - switch opt { - case tcpip.IPHdrIncludedOption: - e.mu.Lock() - e.hdrIncluded = v - e.mu.Unlock() - return nil - } - return tcpip.ErrUnknownProtocolOption -} - // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { @@ -589,33 +585,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { - switch o := opt.(type) { - case *tcpip.LingerOption: - e.mu.Lock() - *o = e.linger - e.mu.Unlock() - return nil - - default: - return tcpip.ErrUnknownProtocolOption - } -} - -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption: - return false, nil - - case tcpip.IPHdrIncludedOption: - e.mu.Lock() - v := e.hdrIncluded - e.mu.Unlock() - return v, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } + return tcpip.ErrUnknownProtocolOption } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. @@ -756,10 +726,12 @@ func (e *endpoint) Stats() tcpip.EndpointStats { // Wait implements stack.TransportEndpoint.Wait. func (*endpoint) Wait() {} +// LastError implements tcpip.Endpoint.LastError. func (*endpoint) LastError() *tcpip.Error { return nil } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go index 7d97cbdc7..4a7e1c039 100644 --- a/pkg/tcpip/transport/raw/endpoint_state.go +++ b/pkg/tcpip/transport/raw/endpoint_state.go @@ -73,7 +73,13 @@ func (e *endpoint) Resume(s *stack.Stack) { // If the endpoint is connected, re-connect. if e.connected { var err *tcpip.Error - e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, e.route.RemoteAddress, e.NetProto, false) + // TODO(gvisor.dev/issue/4906): Properly restore the route with the right + // remote address. We used to pass e.remote.RemoteAddress which was + // effectively the empty address but since moving e.route to hold a pointer + // to a route instead of the route by value, we pass the empty address + // directly. Obviously this was always wrong since we should provide the + // remote address we were connected to, to properly restore the route. + e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, "", e.NetProto, false) if err != nil { panic(err) } diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 518449602..cf232b508 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "more_shards") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -45,7 +45,9 @@ go_library( "rcv.go", "rcv_state.go", "reno.go", + "reno_recovery.go", "sack.go", + "sack_recovery.go", "sack_scoreboard.go", "segment.go", "segment_heap.go", @@ -91,7 +93,7 @@ go_test( "tcp_test.go", "tcp_timestamp_test.go", ], - shard_count = 10, + shard_count = more_shards, deps = [ ":tcp", "//pkg/rand", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 6e5adc383..3e1041cbe 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -213,7 +213,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i route.ResolveWith(s.remoteLinkAddr) n := newEndpoint(l.stack, netProto, queue) - n.v6only = l.v6Only + n.ops.SetV6Only(l.v6Only) n.ID = s.id n.boundNICID = s.nicID n.route = route @@ -599,7 +599,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er ack: s.sequenceNumber + 1, rcvWnd: ctx.rcvWnd, } - if err := e.sendSynTCP(&route, fields, synOpts); err != nil { + if err := e.sendSynTCP(route, fields, synOpts); err != nil { return err } e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment() @@ -752,7 +752,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er // its own goroutine and is responsible for handling connection requests. func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) { e.mu.Lock() - v6Only := e.v6only + v6Only := e.ops.GetV6Only() ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto) defer func() { diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index ac6d879a7..c944dccc0 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -16,6 +16,7 @@ package tcp import ( "encoding/binary" + "math" "time" "gvisor.dev/gvisor/pkg/rand" @@ -133,7 +134,7 @@ func FindWndScale(wnd seqnum.Size) int { return 0 } - max := seqnum.Size(0xffff) + max := seqnum.Size(math.MaxUint16) s := 0 for wnd > max && s < header.MaxWndScale { s++ @@ -300,7 +301,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error { if ttl == 0 { ttl = h.ep.route.DefaultTTL() } - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: ttl, tos: h.ep.sendTOS, @@ -361,7 +362,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { SACKPermitted: h.ep.sackPermitted, MSS: h.ep.amss, } - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, @@ -496,7 +497,7 @@ func (h *handshake) resolveRoute() *tcpip.Error { h.ep.mu.Lock() } if n¬ifyError != 0 { - return h.ep.LastError() + return h.ep.lastErrorLocked() } } @@ -547,7 +548,7 @@ func (h *handshake) start() *tcpip.Error { } h.sendSYNOpts = synOpts - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, @@ -575,7 +576,6 @@ func (h *handshake) complete() *tcpip.Error { return err } defer timer.stop() - for h.state != handshakeCompleted { // Unlock before blocking, and reacquire again afterwards (h.ep.mu is held // throughout handshake processing). @@ -597,7 +597,7 @@ func (h *handshake) complete() *tcpip.Error { // the connection with another ACK or data (as ACKs are never // retransmitted on their own). if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept { - h.ep.sendSynTCP(&h.ep.route, tcpFields{ + h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, @@ -631,9 +631,8 @@ func (h *handshake) complete() *tcpip.Error { h.ep.mu.Lock() } if n¬ifyError != 0 { - return h.ep.LastError() + return h.ep.lastErrorLocked() } - case wakerForNewSegment: if err := h.processSegments(); err != nil { return err @@ -820,8 +819,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso data = data.Clone(nil) optLen := len(tf.opts) - if tf.rcvWnd > 0xffff { - tf.rcvWnd = 0xffff + if tf.rcvWnd > math.MaxUint16 { + tf.rcvWnd = math.MaxUint16 } mss := int(gso.MSS) @@ -865,8 +864,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso // network endpoint and under the provided identity. func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error { optLen := len(tf.opts) - if tf.rcvWnd > 0xffff { - tf.rcvWnd = 0xffff + if tf.rcvWnd > math.MaxUint16 { + tf.rcvWnd = math.MaxUint16 } if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { @@ -941,7 +940,7 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] } options := e.makeOptions(sackBlocks) - err := e.sendTCP(&e.route, tcpFields{ + err := e.sendTCP(e.route, tcpFields{ id: e.ID, ttl: e.ttl, tos: e.sendTOS, @@ -1002,7 +1001,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) { // Only send a reset if the connection is being aborted for a reason // other than receiving a reset. e.setEndpointState(StateError) - e.HardError = err + e.hardError = err if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout { // The exact sequence number to be used for the RST is the same as the // one used by Linux. We need to handle the case of window being shrunk @@ -1080,7 +1079,7 @@ func (e *endpoint) transitionToStateCloseLocked() { // to any other listening endpoint. We reply with RST if we cannot find one. func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, s.nicID) - if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" { + if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" { // Dual-stack socket, try IPv4. ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, s.nicID) } @@ -1141,7 +1140,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { // delete the TCB, and return. case StateCloseWait: e.transitionToStateCloseLocked() - e.HardError = tcpip.ErrAborted + e.hardError = tcpip.ErrAborted e.notifyProtocolGoroutine(notifyTickleWorker) return false, nil default: @@ -1286,7 +1285,7 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error { userTimeout := e.userTimeout e.keepalive.Lock() - if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() { + if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() { e.keepalive.Unlock() return nil } @@ -1323,7 +1322,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) { } // Start the keepalive timer IFF it's enabled and there is no pending // data to send. - if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt { + if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.sndUna != e.snd.sndNxt { e.keepalive.timer.disable() e.keepalive.Unlock() return @@ -1353,7 +1352,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ epilogue := func() { // e.mu is expected to be hold upon entering this section. - if e.snd != nil { e.snd.resendTimer.cleanup() } @@ -1383,7 +1381,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.lastErrorMu.Unlock() e.setEndpointState(StateError) - e.HardError = err + e.hardError = err e.workerCleanup = true // Lock released below. @@ -1638,7 +1636,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func() } extTW, newSyn := e.rcv.handleTimeWaitSegment(s) if newSyn { - info := e.EndpointInfo.TransportEndpointInfo + info := e.TransportEndpointInfo newID := info.ID newID.RemoteAddress = "" newID.RemotePort = 0 diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index a6f25896b..1d1b01a6c 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -405,14 +405,6 @@ func testV4Accept(t *testing.T, c *context.Context) { } } - // Make sure we get the same error when calling the original ep and the - // new one. This validates that v4-mapped endpoints are still able to - // query the V6Only flag, whereas pure v4 endpoints are not. - _, expected := c.EP.GetSockOptBool(tcpip.V6OnlyOption) - if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != expected { - t.Fatalf("GetSockOpt returned unexpected value: got %v, want %v", err, expected) - } - // Check the peer address. addr, err := nep.GetRemoteAddress() if err != nil { @@ -530,12 +522,12 @@ func TestV6AcceptOnV6(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) var addr tcpip.FullAddress - nep, _, err := c.EP.Accept(&addr) + _, _, err := c.EP.Accept(&addr) if err == tcpip.ErrWouldBlock { // Wait for connection to be established. select { case <-ch: - nep, _, err = c.EP.Accept(&addr) + _, _, err = c.EP.Accept(&addr) if err != nil { t.Fatalf("Accept failed: %v", err) } @@ -548,12 +540,6 @@ func TestV6AcceptOnV6(t *testing.T) { if addr.Addr != context.TestV6Addr { t.Errorf("Unexpected remote address: got %s, want %s", addr.Addr, context.TestV6Addr) } - - // Make sure we can still query the v6 only status of the new endpoint, - // that is, that it is in fact a v6 socket. - if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil { - t.Errorf("GetSockOptBool(tcpip.V6OnlyOption) failed: %s", err) - } } func TestV4AcceptOnV4(t *testing.T) { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 4f4f4c65e..bb0795f78 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -310,16 +310,12 @@ type Stats struct { func (*Stats) IsEndpointStats() {} // EndpointInfo holds useful information about a transport endpoint which -// can be queried by monitoring tools. +// can be queried by monitoring tools. This exists to allow tcp-only state to +// be exposed. // // +stateify savable type EndpointInfo struct { stack.TransportEndpointInfo - - // HardError is meaningful only when state is stateError. It stores the - // error to be returned when read/write syscalls are called and the - // endpoint is in this state. HardError is protected by endpoint mu. - HardError *tcpip.Error `state:".(string)"` } // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo @@ -367,6 +363,7 @@ func (*EndpointInfo) IsEndpointInfo() {} // +stateify savable type endpoint struct { EndpointInfo + tcpip.DefaultSocketOptionsHandler // endpointEntry is used to queue endpoints for processing to the // a given tcp processor goroutine. @@ -386,6 +383,11 @@ type endpoint struct { waiterQueue *waiter.Queue `state:"wait"` uniqueID uint64 + // hardError is meaningful only when state is stateError. It stores the + // error to be returned when read/write syscalls are called and the + // endpoint is in this state. hardError is protected by endpoint mu. + hardError *tcpip.Error `state:".(string)"` + // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. lastErrorMu sync.Mutex `state:"nosave"` @@ -421,7 +423,10 @@ type endpoint struct { // mu protects all endpoint fields unless documented otherwise. mu must // be acquired before interacting with the endpoint fields. - mu sync.Mutex `state:"nosave"` + // + // During handshake, mu is locked by the protocol listen goroutine and + // released by the handshake completion goroutine. + mu sync.CrossGoroutineMutex `state:"nosave"` ownedByUser uint32 // state must be read/set using the EndpointState()/setEndpointState() @@ -436,9 +441,8 @@ type endpoint struct { isPortReserved bool `state:"manual"` isRegistered bool `state:"manual"` boundNICID tcpip.NICID - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` ttl uint8 - v6only bool isConnectNotified bool // h stores a reference to the current handshake state if the endpoint is in @@ -506,24 +510,9 @@ type endpoint struct { // delay is a boolean (0 is false) and must be accessed atomically. delay uint32 - // cork holds back segments until full. - // - // cork is a boolean (0 is false) and must be accessed atomically. - cork uint32 - // scoreboard holds TCP SACK Scoreboard information for this endpoint. scoreboard *SACKScoreboard - // The options below aren't implemented, but we remember the user - // settings because applications expect to be able to set/query these - // options. - - // slowAck holds the negated state of quick ack. It is stubbed out and - // does nothing. - // - // slowAck is a boolean (0 is false) and must be accessed atomically. - slowAck uint32 - // segmentQueue is used to hand received segments to the protocol // goroutine. Segments are queued as long as the queue is not full, // and dropped when it is. @@ -685,9 +674,6 @@ type endpoint struct { // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner - // linger is used for SO_LINGER socket option. - linger tcpip.LingerOption - // ops is used to get socket level options. ops tcpip.SocketOptions } @@ -701,7 +687,7 @@ func (e *endpoint) UniqueID() uint64 { // // If userMSS is non-zero and is not greater than the maximum possible MSS for // r, it will be used; otherwise, the maximum possible MSS will be used. -func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 { +func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The maximum possible MSS is dependent on the route. // TODO(b/143359391): Respect TCP Min and Max size. maxMSS := uint16(r.MTU() - header.TCPMinimumSize) @@ -850,7 +836,6 @@ func (e *endpoint) recentTimestamp() uint32 { // +stateify savable type keepalive struct { sync.Mutex `state:"nosave"` - enabled bool idle time.Duration interval time.Duration count int @@ -884,6 +869,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue windowClamp: DefaultReceiveBufferSize, maxSynRetries: DefaultSynRetries, } + e.ops.InitHandler(e) + e.ops.SetMulticastLoop(true) + e.ops.SetQuickAck(true) var ss tcpip.TCPSendBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { @@ -907,7 +895,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue var de tcpip.TCPDelayEnabled if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { - e.SetSockOptBool(tcpip.DelayOption, true) + e.ops.SetDelayOption(true) } var tcpLT tcpip.TCPLingerTimeoutOption @@ -1049,7 +1037,8 @@ func (e *endpoint) Close() { return } - if e.linger.Enabled && e.linger.Timeout == 0 { + linger := e.SocketOptions().GetLinger() + if linger.Enabled && linger.Timeout == 0 { s := e.EndpointState() isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv if isResetState { @@ -1169,7 +1158,11 @@ func (e *endpoint) cleanupLocked() { e.boundPortFlags = ports.Flags{} e.boundDest = tcpip.FullAddress{} - e.route.Release() + if e.route != nil { + e.route.Release() + e.route = nil + } + e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) } @@ -1279,11 +1272,20 @@ func (e *endpoint) ModerateRecvBuf(copied int) { e.rcvListMu.Unlock() } +// SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } -func (e *endpoint) LastError() *tcpip.Error { +// Preconditions: e.mu must be held to call this function. +func (e *endpoint) hardErrorLocked() *tcpip.Error { + err := e.hardError + e.hardError = nil + return err +} + +// Preconditions: e.mu must be held to call this function. +func (e *endpoint) lastErrorLocked() *tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError @@ -1291,6 +1293,16 @@ func (e *endpoint) LastError() *tcpip.Error { return err } +// LastError implements tcpip.Endpoint.LastError. +func (e *endpoint) LastError() *tcpip.Error { + e.LockUser() + defer e.UnlockUser() + if err := e.hardErrorLocked(); err != nil { + return err + } + return e.lastErrorLocked() +} + // Read reads data from the endpoint. func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.LockUser() @@ -1312,9 +1324,11 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, bufUsed := e.rcvBufUsed if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { e.rcvListMu.Unlock() - he := e.HardError if s == StateError { - return buffer.View{}, tcpip.ControlMessages{}, he + if err := e.hardErrorLocked(); err != nil { + return buffer.View{}, tcpip.ControlMessages{}, err + } + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive } e.stats.ReadErrors.NotConnected.Increment() return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected @@ -1370,9 +1384,13 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { // indicating the reason why it's not writable. // Caller must hold e.mu and e.sndBufMu func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { + // The endpoint cannot be written to if it's not connected. switch s := e.EndpointState(); { case s == StateError: - return 0, e.HardError + if err := e.hardErrorLocked(); err != nil { + return 0, err + } + return 0, tcpip.ErrClosedForSend case !s.connecting() && !s.connected(): return 0, tcpip.ErrClosedForSend case s.connecting(): @@ -1478,7 +1496,7 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. -func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { +func (e *endpoint) Peek(vec [][]byte) (int64, *tcpip.Error) { e.LockUser() defer e.UnlockUser() @@ -1486,10 +1504,10 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro // but has some pending unread data. if s := e.EndpointState(); !s.connected() && s != StateClose { if s == StateError { - return 0, tcpip.ControlMessages{}, e.HardError + return 0, e.hardErrorLocked() } e.stats.ReadErrors.InvalidEndpointState.Increment() - return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState + return 0, tcpip.ErrInvalidEndpointState } e.rcvListMu.Lock() @@ -1498,9 +1516,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro if e.rcvBufUsed == 0 { if e.rcvClosed || !e.EndpointState().connected() { e.stats.ReadErrors.ReadClosed.Increment() - return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive + return 0, tcpip.ErrClosedForReceive } - return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock + return 0, tcpip.ErrWouldBlock } // Make a copy of vec so we can modify the slide headers. @@ -1515,7 +1533,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro for len(v) > 0 { if len(vec) == 0 { - return num, tcpip.ControlMessages{}, nil + return num, nil } if len(vec[0]) == 0 { vec = vec[1:] @@ -1530,7 +1548,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro } } - return num, tcpip.ControlMessages{}, nil + return num, nil } // selectWindowLocked returns the new window without checking for shrinking or scaling @@ -1602,72 +1620,39 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo return false, false } -// SetSockOptBool sets a socket option. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - switch opt { - - case tcpip.CorkOption: - e.LockUser() - if !v { - atomic.StoreUint32(&e.cork, 0) - - // Handle the corked data. - e.sndWaker.Assert() - } else { - atomic.StoreUint32(&e.cork, 1) - } - e.UnlockUser() - - case tcpip.DelayOption: - if v { - atomic.StoreUint32(&e.delay, 1) - } else { - atomic.StoreUint32(&e.delay, 0) - - // Handle delayed data. - e.sndWaker.Assert() - } - - case tcpip.KeepaliveEnabledOption: - e.keepalive.Lock() - e.keepalive.enabled = v - e.keepalive.Unlock() - e.notifyProtocolGoroutine(notifyKeepaliveChanged) - - case tcpip.QuickAckOption: - o := uint32(1) - if v { - o = 0 - } - atomic.StoreUint32(&e.slowAck, o) - - case tcpip.ReuseAddressOption: - e.LockUser() - e.portFlags.TupleOnly = v - e.UnlockUser() - - case tcpip.ReusePortOption: - e.LockUser() - e.portFlags.LoadBalanced = v - e.UnlockUser() +// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. +func (e *endpoint) OnReuseAddressSet(v bool) { + e.LockUser() + e.portFlags.TupleOnly = v + e.UnlockUser() +} - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return tcpip.ErrInvalidEndpointState - } +// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. +func (e *endpoint) OnReusePortSet(v bool) { + e.LockUser() + e.portFlags.LoadBalanced = v + e.UnlockUser() +} - // We only allow this to be set when we're in the initial state. - if e.EndpointState() != StateInitial { - return tcpip.ErrInvalidEndpointState - } +// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet. +func (e *endpoint) OnKeepAliveSet(v bool) { + e.notifyProtocolGoroutine(notifyKeepaliveChanged) +} - e.LockUser() - e.v6only = v - e.UnlockUser() +// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. +func (e *endpoint) OnDelayOptionSet(v bool) { + if !v { + // Handle delayed data. + e.sndWaker.Assert() } +} - return nil +// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. +func (e *endpoint) OnCorkOptionSet(v bool) { + if !v { + // Handle the corked data. + e.sndWaker.Assert() + } } // SetSockOptInt sets a socket option. @@ -1851,9 +1836,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { e.keepalive.Unlock() e.notifyProtocolGoroutine(notifyKeepaliveChanged) - case *tcpip.OutOfBandInlineOption: - // We don't currently support disabling this option. - case *tcpip.TCPUserTimeoutOption: e.LockUser() e.userTimeout = time.Duration(*v) @@ -1922,11 +1904,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { case *tcpip.SocketDetachFilterOption: return nil - case *tcpip.LingerOption: - e.LockUser() - e.linger = *v - e.UnlockUser() - default: return nil } @@ -1949,67 +1926,6 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) { return e.rcvBufUsed, nil } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - - case tcpip.CorkOption: - return atomic.LoadUint32(&e.cork) != 0, nil - - case tcpip.DelayOption: - return atomic.LoadUint32(&e.delay) != 0, nil - - case tcpip.KeepaliveEnabledOption: - e.keepalive.Lock() - v := e.keepalive.enabled - e.keepalive.Unlock() - - return v, nil - - case tcpip.QuickAckOption: - v := atomic.LoadUint32(&e.slowAck) == 0 - return v, nil - - case tcpip.ReuseAddressOption: - e.LockUser() - v := e.portFlags.TupleOnly - e.UnlockUser() - - return v, nil - - case tcpip.ReusePortOption: - e.LockUser() - v := e.portFlags.LoadBalanced - e.UnlockUser() - - return v, nil - - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return false, tcpip.ErrUnknownProtocolOption - } - - e.LockUser() - v := e.v6only - e.UnlockUser() - - return v, nil - - case tcpip.MulticastLoopOption: - return true, nil - - case tcpip.AcceptConnOption: - e.LockUser() - defer e.UnlockUser() - - return e.EndpointState() == StateListen, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -2120,10 +2036,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { *o = tcpip.TCPUserTimeoutOption(e.userTimeout) e.UnlockUser() - case *tcpip.OutOfBandInlineOption: - // We don't currently support disabling this option. - *o = 1 - case *tcpip.CongestionControlOption: e.LockUser() *o = e.cc @@ -2152,11 +2064,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { Port: port, } - case *tcpip.LingerOption: - e.LockUser() - *o = e.linger - e.UnlockUser() - default: return tcpip.ErrUnknownProtocolOption } @@ -2166,7 +2073,7 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) { - unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only) + unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) if err != nil { return tcpip.FullAddress{}, 0, err } @@ -2243,7 +2150,10 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc return tcpip.ErrAlreadyConnecting case StateError: - return e.HardError + if err := e.hardErrorLocked(); err != nil { + return err + } + return tcpip.ErrConnectionAborted default: return tcpip.ErrInvalidEndpointState @@ -2417,7 +2327,7 @@ func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error { e.lastErrorMu.Unlock() e.setEndpointState(StateError) - e.HardError = err + e.hardError = err // Call cleanupLocked to free up any reservations. e.cleanupLocked() @@ -2697,7 +2607,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { // v6only set to false. if netProto == header.IPv6ProtocolNumber { stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) - alsoBindToV4 := !e.v6only && addr.Addr == "" && stackHasV4 + alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4 if alsoBindToV4 { netProtos = append(netProtos, header.IPv4ProtocolNumber) } @@ -2782,7 +2692,7 @@ func (e *endpoint) getRemoteAddress() tcpip.FullAddress { func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) { // TCP HandlePacket is not required anymore as inbound packets first - // land at the Dispatcher which then can either delivery using the + // land at the Dispatcher which then can either deliver using the // worker go routine or directly do the invoke the tcp processing inline // based on the state of the endpoint. } @@ -3079,6 +2989,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState { Ssthresh: e.snd.sndSsthresh, SndCAAckCount: e.snd.sndCAAckCount, Outstanding: e.snd.outstanding, + SackedOut: e.snd.sackedOut, SndWnd: e.snd.sndWnd, SndUna: e.snd.sndUna, SndNxt: e.snd.sndNxt, @@ -3161,7 +3072,7 @@ func (e *endpoint) State() uint32 { func (e *endpoint) Info() tcpip.EndpointInfo { e.LockUser() // Make a copy of the endpoint info. - ret := e.EndpointInfo + ret := e.TransportEndpointInfo e.UnlockUser() return &ret } @@ -3187,6 +3098,7 @@ func (e *endpoint) Wait() { } } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index bb901c0f8..ba67176b5 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -321,21 +321,21 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) { } // saveHardError is invoked by stateify. -func (e *EndpointInfo) saveHardError() string { - if e.HardError == nil { +func (e *endpoint) saveHardError() string { + if e.hardError == nil { return "" } - return e.HardError.String() + return e.hardError.String() } // loadHardError is invoked by stateify. -func (e *EndpointInfo) loadHardError(s string) { +func (e *endpoint) loadHardError(s string) { if s == "" { return } - e.HardError = tcpip.StringToError(s) + e.hardError = tcpip.StringToError(s) } // saveMeasureTime is invoked by stateify. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 2329aca4b..672159eed 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -250,7 +250,7 @@ func replyWithReset(stack *stack.Stack, s *segment, tos, ttl uint8) *tcpip.Error ttl = route.DefaultTTL() } - return sendTCP(&route, tcpFields{ + return sendTCP(route, tcpFields{ id: s.id, ttl: ttl, tos: tos, diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 8e0b7c843..405a6dce7 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -16,6 +16,7 @@ package tcp import ( "container/heap" + "math" "time" "gvisor.dev/gvisor/pkg/tcpip" @@ -48,6 +49,10 @@ type receiver struct { rcvWndScale uint8 + // prevBufused is the snapshot of endpoint rcvBufUsed taken when we + // advertise a receive window. + prevBufUsed int + closed bool // pendingRcvdSegments is bounded by the receive buffer size of the @@ -80,9 +85,9 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool { // outgoing packets, we should use what we have advertised for acceptability // test. scaledWindowSize := r.rcvWnd >> r.rcvWndScale - if scaledWindowSize > 0xffff { + if scaledWindowSize > math.MaxUint16 { // This is what we actually put in the Window field. - scaledWindowSize = 0xffff + scaledWindowSize = math.MaxUint16 } advertisedWindowSize := scaledWindowSize << r.rcvWndScale return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize)) @@ -106,6 +111,34 @@ func (r *receiver) currentWindow() (curWnd seqnum.Size) { func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { newWnd := r.ep.selectWindow() curWnd := r.currentWindow() + unackLen := int(r.ep.snd.maxSentAck.Size(r.rcvNxt)) + bufUsed := r.ep.receiveBufferUsed() + + // Grow the right edge of the window only for payloads larger than the + // the segment overhead OR if the application is actively consuming data. + // + // Avoiding growing the right edge otherwise, addresses a situation below: + // An application has been slow in reading data and we have burst of + // incoming segments lengths < segment overhead. Here, our available free + // memory would reduce drastically when compared to the advertised receive + // window. + // + // For example: With incoming 512 bytes segments, segment overhead of + // 552 bytes (at the time of writing this comment), with receive window + // starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0 + // when the curWnd is still 19436 bytes, because for every incoming segment + // newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1), + // while curWnd would reduce by 512 bytes. + // Such a situation causes us to keep tail dropping the incoming segments + // and never advertise zero receive window to the peer. + // + // Linux does a similar check for minimal sk_buff size (128): + // https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783 + // + // Also, if the application is reading the data, we keep growing the right + // edge, as we are still advertising a window that we think can be serviced. + toGrow := unackLen >= SegSize || bufUsed <= r.prevBufUsed + // Update rcvAcc only if new window is > previously advertised window. We // should never shrink the acceptable sequence space once it has been // advertised the peer. If we shrink the acceptable sequence space then we @@ -115,7 +148,7 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { // rcvWUP rcvNxt rcvAcc new rcvAcc // <=====curWnd ===> // <========= newWnd > curWnd ========= > - if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) { + if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) && toGrow { // If the new window moves the right edge, then update rcvAcc. r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd)) } else { @@ -130,11 +163,22 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { // receiver's estimated RTT. r.rcvWnd = newWnd r.rcvWUP = r.rcvNxt + r.prevBufUsed = bufUsed scaledWnd := r.rcvWnd >> r.rcvWndScale if scaledWnd == 0 { // Increment a metric if we are advertising an actual zero window. r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment() } + + // If we started off with a window larger than what can he held in + // the 16bit window field, we ceil the value to the max value. + if scaledWnd > math.MaxUint16 { + scaledWnd = seqnum.Size(math.MaxUint16) + + // Ensure that the stashed receive window always reflects what + // is being advertised. + r.rcvWnd = scaledWnd << r.rcvWndScale + } return r.rcvNxt, scaledWnd } diff --git a/pkg/tcpip/transport/tcp/reno_recovery.go b/pkg/tcpip/transport/tcp/reno_recovery.go new file mode 100644 index 000000000..2aa708e97 --- /dev/null +++ b/pkg/tcpip/transport/tcp/reno_recovery.go @@ -0,0 +1,67 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +// renoRecovery stores the variables related to TCP Reno loss recovery +// algorithm. +// +// +stateify savable +type renoRecovery struct { + s *sender +} + +func newRenoRecovery(s *sender) *renoRecovery { + return &renoRecovery{s: s} +} + +func (rr *renoRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) { + ack := rcvdSeg.ackNumber + snd := rr.s + + // We are in fast recovery mode. Ignore the ack if it's out of range. + if !ack.InRange(snd.sndUna, snd.sndNxt+1) { + return + } + + // Don't count this as a duplicate if it is carrying data or + // updating the window. + if rcvdSeg.logicalLen() != 0 || snd.sndWnd != rcvdSeg.window { + return + } + + // Inflate the congestion window if we're getting duplicate acks + // for the packet we retransmitted. + if !fastRetransmit && ack == snd.fr.first { + // We received a dup, inflate the congestion window by 1 packet + // if we're not at the max yet. Only inflate the window if + // regular FastRecovery is in use, RFC6675 does not require + // inflating cwnd on duplicate ACKs. + if snd.sndCwnd < snd.fr.maxCwnd { + snd.sndCwnd++ + } + return + } + + // A partial ack was received. Retransmit this packet and remember it + // so that we don't retransmit it again. + // + // We don't inflate the window because we're putting the same packet + // back onto the wire. + // + // N.B. The retransmit timer will be reset by the caller. + snd.fr.first = ack + snd.dupAckCount = 0 + snd.resendSegment() +} diff --git a/pkg/tcpip/transport/tcp/sack_recovery.go b/pkg/tcpip/transport/tcp/sack_recovery.go new file mode 100644 index 000000000..7e813fa96 --- /dev/null +++ b/pkg/tcpip/transport/tcp/sack_recovery.go @@ -0,0 +1,120 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import "gvisor.dev/gvisor/pkg/tcpip/seqnum" + +// sackRecovery stores the variables related to TCP SACK loss recovery +// algorithm. +// +// +stateify savable +type sackRecovery struct { + s *sender +} + +func newSACKRecovery(s *sender) *sackRecovery { + return &sackRecovery{s: s} +} + +// handleSACKRecovery implements the loss recovery phase as described in RFC6675 +// section 5, step C. +func (sr *sackRecovery) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { + snd := sr.s + snd.SetPipe() + + if smss := int(snd.ep.scoreboard.SMSS()); limit > smss { + // Cap segment size limit to s.smss as SACK recovery requires + // that all retransmissions or new segments send during recovery + // be of <= SMSS. + limit = smss + } + + nextSegHint := snd.writeList.Front() + for snd.outstanding < snd.sndCwnd { + var nextSeg *segment + var rescueRtx bool + nextSeg, nextSegHint, rescueRtx = snd.NextSeg(nextSegHint) + if nextSeg == nil { + return dataSent + } + if !snd.isAssignedSequenceNumber(nextSeg) || snd.sndNxt.LessThanEq(nextSeg.sequenceNumber) { + // New data being sent. + + // Step C.3 described below is handled by + // maybeSendSegment which increments sndNxt when + // a segment is transmitted. + // + // Step C.3 "If any of the data octets sent in + // (C.1) are above HighData, HighData must be + // updated to reflect the transmission of + // previously unsent data." + // + // We pass s.smss as the limit as the Step 2) requires that + // new data sent should be of size s.smss or less. + if sent := snd.maybeSendSegment(nextSeg, limit, end); !sent { + return dataSent + } + dataSent = true + snd.outstanding++ + snd.writeNext = nextSeg.Next() + continue + } + + // Now handle the retransmission case where we matched either step 1,3 or 4 + // of the NextSeg algorithm. + // RFC 6675, Step C.4. + // + // "The estimate of the amount of data outstanding in the network + // must be updated by incrementing pipe by the number of octets + // transmitted in (C.1)." + snd.outstanding++ + dataSent = true + snd.sendSegment(nextSeg) + + segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) + if rescueRtx { + // We do the last part of rule (4) of NextSeg here to update + // RescueRxt as until this point we don't know if we are going + // to use the rescue transmission. + snd.fr.rescueRxt = snd.fr.last + } else { + // RFC 6675, Step C.2 + // + // "If any of the data octets sent in (C.1) are below + // HighData, HighRxt MUST be set to the highest sequence + // number of the retransmitted segment unless NextSeg () + // rule (4) was invoked for this retransmission." + snd.fr.highRxt = segEnd - 1 + } + } + return dataSent +} + +func (sr *sackRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) { + snd := sr.s + if fastRetransmit { + snd.resendSegment() + } + + // We are in fast recovery mode. Ignore the ack if it's out of range. + if ack := rcvdSeg.ackNumber; !ack.InRange(snd.sndUna, snd.sndNxt+1) { + return + } + + // RFC 6675 recovery algorithm step C 1-5. + end := snd.sndUna.Add(snd.sndWnd) + dataSent := sr.handleSACKRecovery(snd.maxPayloadSize, end) + snd.postXmit(dataSent) +} diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 2091989cc..5ef73ec74 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -204,7 +204,7 @@ func (s *segment) payloadSize() int { // segMemSize is the amount of memory used to hold the segment data and // the associated metadata. func (s *segment) segMemSize() int { - return segSize + s.data.Size() + return SegSize + s.data.Size() } // parse populates the sequence & ack numbers, flags, and window fields of the diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go index 0ab7b8f56..392ff0859 100644 --- a/pkg/tcpip/transport/tcp/segment_unsafe.go +++ b/pkg/tcpip/transport/tcp/segment_unsafe.go @@ -19,5 +19,6 @@ import ( ) const ( - segSize = int(unsafe.Sizeof(segment{})) + // SegSize is the minimal size of the segment overhead. + SegSize = int(unsafe.Sizeof(segment{})) ) diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 0e0fdf14c..cc991aba6 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -18,7 +18,6 @@ import ( "fmt" "math" "sort" - "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" @@ -92,6 +91,17 @@ type congestionControl interface { PostRecovery() } +// lossRecovery is an interface that must be implemented by any supported +// loss recovery algorithm. +type lossRecovery interface { + // DoRecovery is invoked when loss is detected and segments need + // to be retransmitted. The cumulative or selective ACK is passed along + // with the flag which identifies whether the connection entered fast + // retransmit with this ACK and to retransmit the first unacknowledged + // segment. + DoRecovery(rcvdSeg *segment, fastRetransmit bool) +} + // sender holds the state necessary to send TCP segments. // // +stateify savable @@ -108,6 +118,9 @@ type sender struct { // fr holds state related to fast recovery. fr fastRecovery + // lr is the loss recovery algorithm used by the sender. + lr lossRecovery + // sndCwnd is the congestion window, in packets. sndCwnd int @@ -124,6 +137,9 @@ type sender struct { // that have been sent but not yet acknowledged. outstanding int + // sackedOut is the number of packets which are selectively acked. + sackedOut int + // sndWnd is the send window size. sndWnd seqnum.Size @@ -276,6 +292,8 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint s.cc = s.initCongestionControl(ep.cc) + s.lr = s.initLossRecovery() + // A negative sndWndScale means that no scaling is in use, otherwise we // store the scaling value. if sndWndScale > 0 { @@ -330,6 +348,14 @@ func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionCon } } +// initLossRecovery initiates the loss recovery algorithm for the sender. +func (s *sender) initLossRecovery() lossRecovery { + if s.ep.sackPermitted { + return newSACKRecovery(s) + } + return newRenoRecovery(s) +} + // updateMaxPayloadSize updates the maximum payload size based on the given // MTU. If this is in response to "packet too big" control packets (indicated // by the count argument), it also reduces the number of outstanding packets and @@ -349,6 +375,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) { m = 1 } + oldMSS := s.maxPayloadSize s.maxPayloadSize = m if s.gso { s.ep.gso.MSS = uint16(m) @@ -371,6 +398,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) { // Rewind writeNext to the first segment exceeding the MTU. Do nothing // if it is already before such a packet. + nextSeg := s.writeNext for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { if seg == s.writeNext { // We got to writeNext before we could find a segment @@ -378,16 +406,22 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) { break } - if seg.data.Size() > m { + if nextSeg == s.writeNext && seg.data.Size() > m { // We found a segment exceeding the MTU. Rewind // writeNext and try to retransmit it. - s.writeNext = seg - break + nextSeg = seg + } + + if s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { + // Update sackedOut for new maximum payload size. + s.sackedOut -= s.pCount(seg, oldMSS) + s.sackedOut += s.pCount(seg, s.maxPayloadSize) } } // Since we likely reduced the number of outstanding packets, we may be // ready to send some more. + s.writeNext = nextSeg s.sendData() } @@ -550,7 +584,7 @@ func (s *sender) retransmitTimerExpired() bool { // We were attempting fast recovery but were not successful. // Leave the state. We don't need to update ssthresh because it // has already been updated when entered fast-recovery. - s.leaveFastRecovery() + s.leaveRecovery() } s.state = RTORecovery @@ -606,13 +640,13 @@ func (s *sender) retransmitTimerExpired() bool { // pCount returns the number of packets in the segment. Due to GSO, a segment // can be composed of multiple packets. -func (s *sender) pCount(seg *segment) int { +func (s *sender) pCount(seg *segment, maxPayloadSize int) int { size := seg.data.Size() if size == 0 { return 1 } - return (size-1)/s.maxPayloadSize + 1 + return (size-1)/maxPayloadSize + 1 } // splitSeg splits a given segment at the size specified and inserts the @@ -789,7 +823,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se } if !nextTooBig && seg.data.Size() < available { // Segment is not full. - if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 { + if s.outstanding > 0 && s.ep.ops.GetDelayOption() { // Nagle's algorithm. From Wikipedia: // Nagle's algorithm works by // combining a number of small @@ -808,7 +842,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se // send space and MSS. // TODO(gvisor.dev/issue/2833): Drain the held segments after a // timeout. - if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 { + if seg.data.Size() < s.maxPayloadSize && s.ep.ops.GetCorkOption() { return false } } @@ -913,79 +947,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se return true } -// handleSACKRecovery implements the loss recovery phase as described in RFC6675 -// section 5, step C. -func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { - s.SetPipe() - - if smss := int(s.ep.scoreboard.SMSS()); limit > smss { - // Cap segment size limit to s.smss as SACK recovery requires - // that all retransmissions or new segments send during recovery - // be of <= SMSS. - limit = smss - } - - nextSegHint := s.writeList.Front() - for s.outstanding < s.sndCwnd { - var nextSeg *segment - var rescueRtx bool - nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint) - if nextSeg == nil { - return dataSent - } - if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) { - // New data being sent. - - // Step C.3 described below is handled by - // maybeSendSegment which increments sndNxt when - // a segment is transmitted. - // - // Step C.3 "If any of the data octets sent in - // (C.1) are above HighData, HighData must be - // updated to reflect the transmission of - // previously unsent data." - // - // We pass s.smss as the limit as the Step 2) requires that - // new data sent should be of size s.smss or less. - if sent := s.maybeSendSegment(nextSeg, limit, end); !sent { - return dataSent - } - dataSent = true - s.outstanding++ - s.writeNext = nextSeg.Next() - continue - } - - // Now handle the retransmission case where we matched either step 1,3 or 4 - // of the NextSeg algorithm. - // RFC 6675, Step C.4. - // - // "The estimate of the amount of data outstanding in the network - // must be updated by incrementing pipe by the number of octets - // transmitted in (C.1)." - s.outstanding++ - dataSent = true - s.sendSegment(nextSeg) - - segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) - if rescueRtx { - // We do the last part of rule (4) of NextSeg here to update - // RescueRxt as until this point we don't know if we are going - // to use the rescue transmission. - s.fr.rescueRxt = s.fr.last - } else { - // RFC 6675, Step C.2 - // - // "If any of the data octets sent in (C.1) are below - // HighData, HighRxt MUST be set to the highest sequence - // number of the retransmitted segment unless NextSeg () - // rule (4) was invoked for this retransmission." - s.fr.highRxt = segEnd - 1 - } - } - return dataSent -} - func (s *sender) sendZeroWindowProbe() { ack, win := s.ep.rcv.getSendParams() s.unackZeroWindowProbes++ @@ -1014,6 +975,30 @@ func (s *sender) disableZeroWindowProbing() { s.resendTimer.disable() } +func (s *sender) postXmit(dataSent bool) { + if dataSent { + // We sent data, so we should stop the keepalive timer to ensure + // that no keepalives are sent while there is pending data. + s.ep.disableKeepaliveTimer() + } + + // If the sender has advertized zero receive window and we have + // data to be sent out, start zero window probing to query the + // the remote for it's receive window size. + if s.writeNext != nil && s.sndWnd == 0 { + s.enableZeroWindowProbing() + } + + // Enable the timer if we have pending data and it's not enabled yet. + if !s.resendTimer.enabled() && s.sndUna != s.sndNxt { + s.resendTimer.enable(s.rto) + } + // If we have no more pending data, start the keepalive timer. + if s.sndUna == s.sndNxt { + s.ep.resetKeepaliveTimer(false) + } +} + // sendData sends new data segments. It is called when data becomes available or // when the send window opens up. func (s *sender) sendData() { @@ -1034,55 +1019,29 @@ func (s *sender) sendData() { } var dataSent bool - - // RFC 6675 recovery algorithm step C 1-5. - if s.fr.active && s.ep.sackPermitted { - dataSent = s.handleSACKRecovery(s.maxPayloadSize, end) - } else { - for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { - cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize - if cwndLimit < limit { - limit = cwndLimit - } - if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { - // Move writeNext along so that we don't try and scan data that - // has already been SACKED. - s.writeNext = seg.Next() - continue - } - if sent := s.maybeSendSegment(seg, limit, end); !sent { - break - } - dataSent = true - s.outstanding += s.pCount(seg) + for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { + cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize + if cwndLimit < limit { + limit = cwndLimit + } + if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { + // Move writeNext along so that we don't try and scan data that + // has already been SACKED. s.writeNext = seg.Next() + continue } + if sent := s.maybeSendSegment(seg, limit, end); !sent { + break + } + dataSent = true + s.outstanding += s.pCount(seg, s.maxPayloadSize) + s.writeNext = seg.Next() } - if dataSent { - // We sent data, so we should stop the keepalive timer to ensure - // that no keepalives are sent while there is pending data. - s.ep.disableKeepaliveTimer() - } - - // If the sender has advertized zero receive window and we have - // data to be sent out, start zero window probing to query the - // the remote for it's receive window size. - if s.writeNext != nil && s.sndWnd == 0 { - s.enableZeroWindowProbing() - } - - // Enable the timer if we have pending data and it's not enabled yet. - if !s.resendTimer.enabled() && s.sndUna != s.sndNxt { - s.resendTimer.enable(s.rto) - } - // If we have no more pending data, start the keepalive timer. - if s.sndUna == s.sndNxt { - s.ep.resetKeepaliveTimer(false) - } + s.postXmit(dataSent) } -func (s *sender) enterFastRecovery() { +func (s *sender) enterRecovery() { s.fr.active = true // Save state to reflect we're now in fast recovery. // @@ -1090,6 +1049,7 @@ func (s *sender) enterFastRecovery() { // We inflate the cwnd by 3 to account for the 3 packets which triggered // the 3 duplicate ACKs and are now not in flight. s.sndCwnd = s.sndSsthresh + 3 + s.sackedOut = 0 s.fr.first = s.sndUna s.fr.last = s.sndNxt - 1 s.fr.maxCwnd = s.sndCwnd + s.outstanding @@ -1104,7 +1064,7 @@ func (s *sender) enterFastRecovery() { s.ep.stack.Stats().TCP.FastRecovery.Increment() } -func (s *sender) leaveFastRecovery() { +func (s *sender) leaveRecovery() { s.fr.active = false s.fr.maxCwnd = 0 s.dupAckCount = 0 @@ -1115,57 +1075,6 @@ func (s *sender) leaveFastRecovery() { s.cc.PostRecovery() } -func (s *sender) handleFastRecovery(seg *segment) (rtx bool) { - ack := seg.ackNumber - // We are in fast recovery mode. Ignore the ack if it's out of - // range. - if !ack.InRange(s.sndUna, s.sndNxt+1) { - return false - } - - // Leave fast recovery if it acknowledges all the data covered by - // this fast recovery session. - if s.fr.last.LessThan(ack) { - s.leaveFastRecovery() - return false - } - - if s.ep.sackPermitted { - // When SACK is enabled we let retransmission be governed by - // the SACK logic. - return false - } - - // Don't count this as a duplicate if it is carrying data or - // updating the window. - if seg.logicalLen() != 0 || s.sndWnd != seg.window { - return false - } - - // Inflate the congestion window if we're getting duplicate acks - // for the packet we retransmitted. - if ack == s.fr.first { - // We received a dup, inflate the congestion window by 1 packet - // if we're not at the max yet. Only inflate the window if - // regular FastRecovery is in use, RFC6675 does not require - // inflating cwnd on duplicate ACKs. - if s.sndCwnd < s.fr.maxCwnd { - s.sndCwnd++ - } - return false - } - - // A partial ack was received. Retransmit this packet and - // remember it so that we don't retransmit it again. We don't - // inflate the window because we're putting the same packet back - // onto the wire. - // - // N.B. The retransmit timer will be reset by the caller. - s.fr.first = ack - s.dupAckCount = 0 - return true -} - // isAssignedSequenceNumber relies on the fact that we only set flags once a // sequencenumber is assigned and that is only done right before we send the // segment. As a result any segment that has a non-zero flag has a valid @@ -1228,14 +1137,11 @@ func (s *sender) SetPipe() { s.outstanding = pipe } -// checkDuplicateAck is called when an ack is received. It manages the state -// related to duplicate acks and determines if a retransmit is needed according -// to the rules in RFC 6582 (NewReno). -func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { +// detectLoss is called when an ack is received and returns whether a loss is +// detected. It manages the state related to duplicate acks and determines if +// a retransmit is needed according to the rules in RFC 6582 (NewReno). +func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { ack := seg.ackNumber - if s.fr.active { - return s.handleFastRecovery(seg) - } // We're not in fast recovery yet. A segment is considered a duplicate // only if it doesn't carry any data and doesn't update the send window, @@ -1266,14 +1172,14 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 // // We only do the check here, the incrementing of last to the highest - // sequence number transmitted till now is done when enterFastRecovery + // sequence number transmitted till now is done when enterRecovery // is invoked. if !s.fr.last.LessThan(seg.ackNumber) { s.dupAckCount = 0 return false } s.cc.HandleNDupAcks() - s.enterFastRecovery() + s.enterRecovery() s.dupAckCount = 0 return true } @@ -1313,6 +1219,7 @@ func (s *sender) walkSACK(rcvdSeg *segment) { s.rc.update(seg, rcvdSeg, s.ep.tsOffset) s.rc.detectReorder(seg) seg.acked = true + s.sackedOut += s.pCount(seg, s.maxPayloadSize) } seg = seg.Next() } @@ -1415,14 +1322,23 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { s.SetPipe() } - // Count the duplicates and do the fast retransmit if needed. - rtx := s.checkDuplicateAck(rcvdSeg) + ack := rcvdSeg.ackNumber + fastRetransmit := false + // Do not leave fast recovery, if the ACK is out of range. + if s.fr.active { + // Leave fast recovery if it acknowledges all the data covered by + // this fast recovery session. + if ack.InRange(s.sndUna, s.sndNxt+1) && s.fr.last.LessThan(ack) { + s.leaveRecovery() + } + } else { + // Detect loss by counting the duplicates and enter recovery. + fastRetransmit = s.detectLoss(rcvdSeg) + } // Stash away the current window size. s.sndWnd = rcvdSeg.window - ack := rcvdSeg.ackNumber - // Disable zero window probing if remote advertizes a non-zero receive // window. This can be with an ACK to the zero window probe (where the // acknumber refers to the already acknowledged byte) OR to any previously @@ -1477,10 +1393,10 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { datalen := seg.logicalLen() if datalen > ackLeft { - prevCount := s.pCount(seg) + prevCount := s.pCount(seg, s.maxPayloadSize) seg.data.TrimFront(int(ackLeft)) seg.sequenceNumber.UpdateForward(ackLeft) - s.outstanding -= prevCount - s.pCount(seg) + s.outstanding -= prevCount - s.pCount(seg, s.maxPayloadSize) break } @@ -1496,11 +1412,13 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { s.writeList.Remove(seg) - // If SACK is enabled then Only reduce outstanding if + // If SACK is enabled then only reduce outstanding if // the segment was not previously SACKED as these have // already been accounted for in SetPipe(). if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { - s.outstanding -= s.pCount(seg) + s.outstanding -= s.pCount(seg, s.maxPayloadSize) + } else { + s.sackedOut -= s.pCount(seg, s.maxPayloadSize) } seg.decRef() ackLeft -= datalen @@ -1539,19 +1457,24 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { s.resendTimer.disable() } } + // Now that we've popped all acknowledged data from the retransmit // queue, retransmit if needed. - if rtx { - s.resendSegment() + if s.fr.active { + s.lr.DoRecovery(rcvdSeg, fastRetransmit) + // When SACK is enabled data sending is governed by steps in + // RFC 6675 Section 5 recovery steps A-C. + // See: https://tools.ietf.org/html/rfc6675#section-5. + if s.ep.sackPermitted { + return + } } // Send more data now that some of the pending data has been ack'd, or // that the window opened up, or the congestion window was inflated due // to a duplicate ack during fast recovery. This will also re-enable // the retransmit timer if needed. - if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo { - s.sendData() - } + s.sendData() } // sendSegment sends the specified segment. diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index ef7f5719f..faf0c0ad7 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -590,3 +590,45 @@ func TestSACKRecovery(t *testing.T) { expected++ } } + +// TestSACKUpdateSackedOut tests the sacked out field is updated when a SACK +// is received. +func TestSACKUpdateSackedOut(t *testing.T) { + c := context.New(t, uint32(mtu)) + defer c.Cleanup() + + probeDone := make(chan struct{}) + ackNum := 0 + c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) { + // Validate that the endpoint Sender.SackedOut is what we expect. + if state.Sender.SackedOut != 2 && ackNum == 0 { + t.Fatalf("SackedOut got updated to wrong value got: %v want: 2", state.Sender.SackedOut) + } + + if state.Sender.SackedOut != 0 && ackNum == 1 { + t.Fatalf("SackedOut got updated to wrong value got: %v want: 0", state.Sender.SackedOut) + } + if ackNum > 0 { + close(probeDone) + } + ackNum++ + }) + setStackSACKPermitted(t, c, true) + createConnectedWithSACKAndTS(c) + + sendAndReceive(t, c, 8) + + // ACK for [3-5] packets. + seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) + start := c.IRS.Add(seqnum.Size(1 + 3*maxPayload)) + bytesRead := 2 * maxPayload + end := start.Add(seqnum.Size(bytesRead)) + c.SendAckWithSACK(seq, bytesRead, []header.SACKBlock{{start, end}}) + + bytesRead += 3 * maxPayload + c.SendAck(seq, bytesRead) + + // Wait for the probe function to finish processing the ACK before the + // test completes. + <-probeDone +} diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 9f0fb41e3..351a5e4f5 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -75,9 +75,6 @@ func TestGiveUpConnect(t *testing.T) { // Wait for ep to become writable. <-notifyCh - if err := ep.LastError(); err != tcpip.ErrAborted { - t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted) - } // Call Connect again to retreive the handshake failure status // and stats updates. @@ -267,7 +264,7 @@ func TestTCPResetsSentNoICMP(t *testing.T) { } // Read outgoing ICMP stats and check no ICMP DstUnreachable was recorded. - sent := stats.ICMP.V4PacketsSent + sent := stats.ICMP.V4.PacketsSent if got, want := sent.DstUnreachable.Value(), uint64(0); got != want { t.Errorf("got ICMP DstUnreachable.Value() = %d, want = %d", got, want) } @@ -1935,6 +1932,84 @@ func TestFullWindowReceive(t *testing.T) { ) } +// Test the stack receive window advertisement on receiving segments smaller than +// segment overhead. It tests for the right edge of the window to not grow when +// the endpoint is not being read from. +func TestSmallSegReceiveWindowAdvertisement(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + opt := tcpip.TCPReceiveBufferSizeRangeOption{ + Min: 1, + Default: tcp.DefaultReceiveBufferSize, + Max: tcp.DefaultReceiveBufferSize << tcp.FindWndScale(seqnum.Size(tcp.DefaultReceiveBufferSize)), + } + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } + + c.AcceptWithOptions(tcp.FindWndScale(seqnum.Size(opt.Default)), header.TCPSynOptions{MSS: defaultIPv4MSS}) + + // Bump up the receive buffer size such that, when the receive window grows, + // the scaled window exceeds maxUint16. + if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, opt.Max); err != nil { + t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed: %s", opt.Max, err) + } + + // Keep the payload size < segment overhead and such that it is a multiple + // of the window scaled value. This enables the test to perform equality + // checks on the incoming receive window. + payload := generateRandomPayload(t, (tcp.SegSize-1)&(1<<c.RcvdWindowScale)) + payloadLen := seqnum.Size(len(payload)) + iss := seqnum.Value(789) + seqNum := iss.Add(1) + + // Send payload to the endpoint and return the advertised receive window + // from the endpoint. + getIncomingRcvWnd := func() uint32 { + c.SendPacket(payload, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + SeqNum: seqNum, + AckNum: c.IRS.Add(1), + Flags: header.TCPFlagAck, + RcvWnd: 30000, + }) + seqNum = seqNum.Add(payloadLen) + + pkt := c.GetPacket() + return uint32(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) << c.RcvdWindowScale + } + + // Read the advertised receive window with the ACK for payload. + rcvWnd := getIncomingRcvWnd() + + // Check if the subsequent ACK to our send has not grown the right edge of + // the window. + if got, want := getIncomingRcvWnd(), rcvWnd-uint32(len(payload)); got != want { + t.Fatalf("got incomingRcvwnd %d want %d", got, want) + } + + // Read the data so that the subsequent ACK from the endpoint + // grows the right edge of the window. + if _, _, err := c.EP.Read(nil); err != nil { + t.Fatalf("got Read(nil) = %s", err) + } + + // Check if we have received max uint16 as our advertised + // scaled window now after a read above. + maxRcv := uint32(math.MaxUint16 << c.RcvdWindowScale) + if got, want := getIncomingRcvWnd(), maxRcv; got != want { + t.Fatalf("got incomingRcvwnd %d want %d", got, want) + } + + // Check if the subsequent ACK to our send has not grown the right edge of + // the window. + if got, want := getIncomingRcvWnd(), maxRcv-uint32(len(payload)); got != want { + t.Fatalf("got incomingRcvwnd %d want %d", got, want) + } +} + func TestNoWindowShrinking(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -2532,10 +2607,10 @@ func TestSegmentMerging(t *testing.T) { { "cork", func(ep tcpip.Endpoint) { - ep.SetSockOptBool(tcpip.CorkOption, true) + ep.SocketOptions().SetCorkOption(true) }, func(ep tcpip.Endpoint) { - ep.SetSockOptBool(tcpip.CorkOption, false) + ep.SocketOptions().SetCorkOption(false) }, }, } @@ -2627,7 +2702,7 @@ func TestDelay(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) - c.EP.SetSockOptBool(tcpip.DelayOption, true) + c.EP.SocketOptions().SetDelayOption(true) var allData []byte for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { @@ -2675,7 +2750,7 @@ func TestUndelay(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) - c.EP.SetSockOptBool(tcpip.DelayOption, true) + c.EP.SocketOptions().SetDelayOption(true) allData := [][]byte{{0}, {1, 2, 3}} for i, data := range allData { @@ -2708,7 +2783,7 @@ func TestUndelay(t *testing.T) { // Check that we don't get the second packet yet. c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond) - c.EP.SetSockOptBool(tcpip.DelayOption, false) + c.EP.SocketOptions().SetDelayOption(false) // Check that data is received. second := c.GetPacket() @@ -2745,8 +2820,8 @@ func TestMSSNotDelayed(t *testing.T) { fn func(tcpip.Endpoint) }{ {"no-op", func(tcpip.Endpoint) {}}, - {"delay", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.DelayOption, true) }}, - {"cork", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.CorkOption, true) }}, + {"delay", func(ep tcpip.Endpoint) { ep.SocketOptions().SetDelayOption(true) }}, + {"cork", func(ep tcpip.Endpoint) { ep.SocketOptions().SetCorkOption(true) }}, } for _, test := range tests { @@ -3198,6 +3273,11 @@ loop: case tcpip.ErrWouldBlock: select { case <-ch: + // Expect the state to be StateError and subsequent Reads to fail with HardError. + if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset { + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) + } + break loop case <-time.After(1 * time.Second): t.Fatalf("Timed out waiting for reset to arrive") } @@ -3207,14 +3287,10 @@ loop: t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) } } - // Expect the state to be StateError and subsequent Reads to fail with HardError. - if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset { - t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) - } + if tcp.EndpointState(c.EP.State()) != tcp.StateError { t.Fatalf("got EP state is not StateError") } - if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 { t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got) } @@ -4150,7 +4226,7 @@ func TestReadAfterClosedState(t *testing.T) { // Check that peek works. peekBuf := make([]byte, 10) - n, _, err := c.EP.Peek([][]byte{peekBuf}) + n, err := c.EP.Peek([][]byte{peekBuf}) if err != nil { t.Fatalf("Peek failed: %s", err) } @@ -4176,7 +4252,7 @@ func TestReadAfterClosedState(t *testing.T) { t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrClosedForReceive) } - if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive { + if _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive { t.Fatalf("got c.EP.Peek(...) = %s, want = %s", err, tcpip.ErrClosedForReceive) } } @@ -4193,9 +4269,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4205,9 +4279,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4218,9 +4290,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4233,9 +4303,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4246,9 +4314,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4261,9 +4327,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %s", err) } - if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { - t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) - } + c.EP.SocketOptions().SetReuseAddress(true) if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } @@ -4656,13 +4720,9 @@ func TestConnectAvoidsBoundPorts(t *testing.T) { switch network { case "ipv4": case "ipv6": - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - t.Fatalf("SetSockOptBool(V6OnlyOption(true)) failed: %s", err) - } + ep.SocketOptions().SetV6Only(true) case "dual": - if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil { - t.Fatalf("SetSockOptBool(V6OnlyOption(false)) failed: %s", err) - } + ep.SocketOptions().SetV6Only(false) default: t.Fatalf("unknown network: '%s'", network) } @@ -4998,9 +5058,7 @@ func TestKeepalive(t *testing.T) { if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil { t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5): %s", err) } - if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil { - t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err) - } + c.EP.SocketOptions().SetKeepAlive(true) // 5 unacked keepalives are sent. ACK each one, and check that the // connection stays alive after 5. @@ -6118,10 +6176,13 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // Introduce a 25ms latency by delaying the first byte. latency := 25 * time.Millisecond time.Sleep(latency) - rawEP.SendPacketWithTS([]byte{1}, tsVal) + // Send an initial payload with atleast segment overhead size. The receive + // window would not grow for smaller segments. + rawEP.SendPacketWithTS(make([]byte, tcp.SegSize), tsVal) pkt := rawEP.VerifyAndReturnACKWithTS(tsVal) rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() + time.Sleep(25 * time.Millisecond) // Allocate a large enough payload for the test. @@ -6394,10 +6455,7 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcpip.T if err != nil { t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %s", err) } - gotDelayOption, err := ep.GetSockOptBool(tcpip.DelayOption) - if err != nil { - t.Fatalf("ep.GetSockOptBool(tcpip.DelayOption) failed: %s", err) - } + gotDelayOption := ep.SocketOptions().GetDelayOption() if gotDelayOption != wantDelayOption { t.Errorf("ep.GetSockOptBool(tcpip.DelayOption) got: %t, want: %t", gotDelayOption, wantDelayOption) } @@ -7250,9 +7308,7 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil { t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10): %s", err) } - if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil { - t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err) - } + c.EP.SocketOptions().SetKeepAlive(true) // Set userTimeout to be the duration to be 1 keepalive // probes. Which means that after the first probe is sent diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index e6aa4fc4b..ee55f030c 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -592,9 +592,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) { c.t.Fatalf("NewEndpoint failed: %v", err) } - if err := c.EP.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil { - c.t.Fatalf("SetSockOpt failed failed: %v", err) - } + c.EP.SocketOptions().SetV6Only(v6only) } // GetV6Packet reads a single packet from the link layer endpoint of the context @@ -637,11 +635,11 @@ func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcp // Initialize the IP header. ip := header.IPv6(buf) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(header.TCPMinimumSize + len(payload)), - NextHeader: uint8(tcp.ProtocolNumber), - HopLimit: 65, - SrcAddr: src, - DstAddr: dst, + PayloadLength: uint16(header.TCPMinimumSize + len(payload)), + TransportProtocol: tcp.ProtocolNumber, + HopLimit: 65, + SrcAddr: src, + DstAddr: dst, }) // Initialize the TCP header. diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index c78549424..153e8c950 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -56,6 +56,8 @@ go_test( "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", "//pkg/waiter", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 57976d4e3..763d1d654 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -16,8 +16,8 @@ package udp import ( "fmt" + "sync/atomic" - "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -30,10 +30,11 @@ import ( // +stateify savable type udpPacket struct { udpPacketEntry - senderAddress tcpip.FullAddress - packetInfo tcpip.IPPacketInfo - data buffer.VectorisedView `state:".(buffer.VectorisedView)"` - timestamp int64 + senderAddress tcpip.FullAddress + destinationAddress tcpip.FullAddress + packetInfo tcpip.IPPacketInfo + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + timestamp int64 // tos stores either the receiveTOS or receiveTClass value. tos uint8 } @@ -77,6 +78,7 @@ func (s EndpointState) String() string { // +stateify savable type endpoint struct { stack.TransportEndpointInfo + tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. @@ -94,21 +96,20 @@ type endpoint struct { rcvClosed bool // The following fields are protected by the mu mutex. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - sndBufSizeMax int + mu sync.RWMutex `state:"nosave"` + sndBufSize int + sndBufSizeMax int + // state must be read/set using the EndpointState()/setEndpointState() + // methods. state EndpointState - route stack.Route `state:"manual"` + route *stack.Route `state:"manual"` dstPort uint16 - v6only bool ttl uint8 multicastTTL uint8 multicastAddr tcpip.Address multicastNICID tcpip.NICID - multicastLoop bool portFlags ports.Flags bindToDevice tcpip.NICID - noChecksum bool lastErrorMu sync.Mutex `state:"nosave"` lastError *tcpip.Error `state:".(string)"` @@ -122,17 +123,6 @@ type endpoint struct { // applied while sending packets. Defaults to 0 as on Linux. sendTOS uint8 - // receiveTOS determines if the incoming IPv4 TOS header field is passed - // as ancillary data to ControlMessages on Read. - receiveTOS bool - - // receiveTClass determines if the incoming IPv6 TClass header field is - // passed as ancillary data to ControlMessages on Read. - receiveTClass bool - - // receiveIPPacketInfo determines if the packet info is returned by Read. - receiveIPPacketInfo bool - // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags @@ -154,9 +144,6 @@ type endpoint struct { // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner - // linger is used for SO_LINGER socket option. - linger tcpip.LingerOption - // ops is used to get socket level options. ops tcpip.SocketOptions } @@ -188,13 +175,14 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue // // Linux defaults to TTL=1. multicastTTL: 1, - multicastLoop: true, rcvBufSizeMax: 32 * 1024, sndBufSizeMax: 32 * 1024, multicastMemberships: make(map[multicastMembership]struct{}), state: StateInitial, uniqueID: s.UniqueID(), } + e.ops.InitHandler(e) + e.ops.SetMulticastLoop(true) // Override with stack defaults. var ss stack.SendBufferSizeOption @@ -210,6 +198,20 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue return e } +// setEndpointState updates the state of the endpoint to state atomically. This +// method is unexported as the only place we should update the state is in this +// package but we allow the state to be read freely without holding e.mu. +// +// Precondition: e.mu must be held to call this method. +func (e *endpoint) setEndpointState(state EndpointState) { + atomic.StoreUint32((*uint32)(&e.state), uint32(state)) +} + +// EndpointState() returns the current state of the endpoint. +func (e *endpoint) EndpointState() EndpointState { + return EndpointState(atomic.LoadUint32((*uint32)(&e.state))) +} + // UniqueID implements stack.TransportEndpoint.UniqueID. func (e *endpoint) UniqueID() uint64 { return e.uniqueID @@ -235,7 +237,7 @@ func (e *endpoint) Close() { e.mu.Lock() e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite - switch e.state { + switch e.EndpointState() { case StateBound, StateConnected: e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{}) @@ -258,10 +260,13 @@ func (e *endpoint) Close() { } e.rcvMu.Unlock() - e.route.Release() + if e.route != nil { + e.route.Release() + e.route = nil + } // Update the state. - e.state = StateClosed + e.setEndpointState(StateClosed) e.mu.Unlock() @@ -303,24 +308,23 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess HasTimestamp: true, Timestamp: p.timestamp, } - e.mu.RLock() - receiveTOS := e.receiveTOS - receiveTClass := e.receiveTClass - receiveIPPacketInfo := e.receiveIPPacketInfo - e.mu.RUnlock() - if receiveTOS { + if e.ops.GetReceiveTOS() { cm.HasTOS = true cm.TOS = p.tos } - if receiveTClass { + if e.ops.GetReceiveTClass() { cm.HasTClass = true // Although TClass is an 8-bit value it's read in the CMsg as a uint32. cm.TClass = uint32(p.tos) } - if receiveIPPacketInfo { + if e.ops.GetReceivePacketInfo() { cm.HasIPPacketInfo = true cm.PacketInfo = p.packetInfo } + if e.ops.GetReceiveOriginalDstAddress() { + cm.HasOriginalDstAddress = true + cm.OriginalDstAddress = p.destinationAddress + } return p.data.ToView(), cm, nil } @@ -330,7 +334,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess // // Returns true for retry if preparation should be retried. func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) { - switch e.state { + switch e.EndpointState() { case StateInitial: case StateConnected: return false, nil @@ -352,7 +356,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. - if e.state != StateInitial { + if e.EndpointState() != StateInitial { return true, nil } @@ -367,9 +371,9 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi // connectRoute establishes a route to the specified interface or the // configured multicast interface if no interface is specified and the // specified address is a multicast address. -func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (stack.Route, tcpip.NICID, *tcpip.Error) { +func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, *tcpip.Error) { localAddr := e.ID.LocalAddress - if isBroadcastOrMulticast(localAddr) { + if e.isBroadcastOrMulticast(nicID, netProto, localAddr) { // A packet can only originate from a unicast address (i.e., an interface). localAddr = "" } @@ -384,9 +388,9 @@ func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netPr } // Find a route to the desired destination. - r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.multicastLoop) + r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop()) if err != nil { - return stack.Route{}, 0, err + return nil, 0, err } return r, nicID, nil } @@ -429,7 +433,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c to := opts.To e.mu.RLock() - defer e.mu.RUnlock() + lockReleased := false + defer func() { + if lockReleased { + return + } + e.mu.RUnlock() + }() // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { @@ -448,36 +458,9 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } } - var route *stack.Route - var resolve func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) - var dstPort uint16 - if to == nil { - route = &e.route - dstPort = e.dstPort - resolve = func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) { - // Promote lock to exclusive if using a shared route, given that it may - // need to change in Route.Resolve() call below. - e.mu.RUnlock() - e.mu.Lock() - - // Recheck state after lock was re-acquired. - if e.state != StateConnected { - err = tcpip.ErrInvalidEndpointState - } - if err == nil && route.IsResolutionRequired() { - ch, err = route.Resolve(waker) - } - - e.mu.Unlock() - e.mu.RLock() - - // Recheck state after lock was re-acquired. - if e.state != StateConnected { - err = tcpip.ErrInvalidEndpointState - } - return - } - } else { + route := e.route + dstPort := e.dstPort + if to != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. nicID := to.NIC @@ -505,9 +488,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } defer r.Release() - route = &r + route = r dstPort = dst.Port - resolve = route.Resolve } if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { @@ -515,7 +497,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } if route.IsResolutionRequired() { - if ch, err := resolve(nil); err != nil { + if ch, err := route.Resolve(nil); err != nil { if err == tcpip.ErrWouldBlock { return 0, ch, tcpip.ErrNoLinkAddress } @@ -541,77 +523,46 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c useDefaultTTL = false } - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil { + localPort := e.ID.LocalPort + sendTOS := e.sendTOS + owner := e.owner + noChecksum := e.SocketOptions().GetNoChecksum() + lockReleased = true + e.mu.RUnlock() + + // Do not hold lock when sending as loopback is synchronous and if the UDP + // datagram ends up generating an ICMP response then it can result in a + // deadlock where the ICMP response handling ends up acquiring this endpoint's + // mutex using e.mu.RLock() in endpoint.HandleControlPacket which can cause a + // deadlock if another caller is trying to acquire e.mu in exclusive mode w/ + // e.mu.Lock(). Since e.mu.Lock() prevents any new read locks to ensure the + // lock can be eventually acquired. + // + // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read + // locking is prohibited. + if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil { return 0, nil, err } return int64(len(v)), nil, nil } // Peek only returns data from a single datagram, so do nothing here. -func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil +func (e *endpoint) Peek([][]byte) (int64, *tcpip.Error) { + return 0, nil } -// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. -func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { - switch opt { - case tcpip.MulticastLoopOption: - e.mu.Lock() - e.multicastLoop = v - e.mu.Unlock() - - case tcpip.NoChecksumOption: - e.mu.Lock() - e.noChecksum = v - e.mu.Unlock() - - case tcpip.ReceiveTOSOption: - e.mu.Lock() - e.receiveTOS = v - e.mu.Unlock() - - case tcpip.ReceiveTClassOption: - // We only support this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return tcpip.ErrNotSupported - } - - e.mu.Lock() - e.receiveTClass = v - e.mu.Unlock() - - case tcpip.ReceiveIPPacketInfoOption: - e.mu.Lock() - e.receiveIPPacketInfo = v - e.mu.Unlock() - - case tcpip.ReuseAddressOption: - e.mu.Lock() - e.portFlags.MostRecent = v - e.mu.Unlock() - - case tcpip.ReusePortOption: - e.mu.Lock() - e.portFlags.LoadBalanced = v - e.mu.Unlock() - - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return tcpip.ErrInvalidEndpointState - } - - e.mu.Lock() - defer e.mu.Unlock() - - // We only allow this to be set when we're in the initial state. - if e.state != StateInitial { - return tcpip.ErrInvalidEndpointState - } +// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. +func (e *endpoint) OnReuseAddressSet(v bool) { + e.mu.Lock() + e.portFlags.MostRecent = v + e.mu.Unlock() +} - e.v6only = v - } - return nil +// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. +func (e *endpoint) OnReusePortSet(v bool) { + e.mu.Lock() + e.portFlags.LoadBalanced = v + e.mu.Unlock() } // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. @@ -814,90 +765,10 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { case *tcpip.SocketDetachFilterOption: return nil - - case *tcpip.LingerOption: - e.mu.Lock() - e.linger = *v - e.mu.Unlock() } return nil } -// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. -func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - switch opt { - case tcpip.KeepaliveEnabledOption: - return false, nil - - case tcpip.MulticastLoopOption: - e.mu.RLock() - v := e.multicastLoop - e.mu.RUnlock() - return v, nil - - case tcpip.NoChecksumOption: - e.mu.RLock() - v := e.noChecksum - e.mu.RUnlock() - return v, nil - - case tcpip.ReceiveTOSOption: - e.mu.RLock() - v := e.receiveTOS - e.mu.RUnlock() - return v, nil - - case tcpip.ReceiveTClassOption: - // We only support this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return false, tcpip.ErrNotSupported - } - - e.mu.RLock() - v := e.receiveTClass - e.mu.RUnlock() - return v, nil - - case tcpip.ReceiveIPPacketInfoOption: - e.mu.RLock() - v := e.receiveIPPacketInfo - e.mu.RUnlock() - return v, nil - - case tcpip.ReuseAddressOption: - e.mu.RLock() - v := e.portFlags.MostRecent - e.mu.RUnlock() - - return v, nil - - case tcpip.ReusePortOption: - e.mu.RLock() - v := e.portFlags.LoadBalanced - e.mu.RUnlock() - - return v, nil - - case tcpip.V6OnlyOption: - // We only recognize this option on v6 endpoints. - if e.NetProto != header.IPv6ProtocolNumber { - return false, tcpip.ErrUnknownProtocolOption - } - - e.mu.RLock() - v := e.v6only - e.mu.RUnlock() - - return v, nil - - case tcpip.AcceptConnOption: - return false, nil - - default: - return false, tcpip.ErrUnknownProtocolOption - } -} - // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { switch opt { @@ -972,11 +843,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { *o = tcpip.BindToDeviceOption(e.bindToDevice) e.mu.RUnlock() - case *tcpip.LingerOption: - e.mu.RLock() - *o = e.linger - e.mu.RUnlock() - default: return tcpip.ErrUnknownProtocolOption } @@ -1036,7 +902,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) { - unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only) + unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) if err != nil { return tcpip.FullAddress{}, 0, err } @@ -1048,7 +914,7 @@ func (e *endpoint) Disconnect() *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - if e.state != StateConnected { + if e.EndpointState() != StateConnected { return nil } var ( @@ -1071,7 +937,7 @@ func (e *endpoint) Disconnect() *tcpip.Error { if err != nil { return err } - e.state = StateBound + e.setEndpointState(StateBound) boundPortFlags = e.boundPortFlags } else { if e.ID.LocalPort != 0 { @@ -1079,14 +945,14 @@ func (e *endpoint) Disconnect() *tcpip.Error { e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{}) e.boundPortFlags = ports.Flags{} } - e.state = StateInitial + e.setEndpointState(StateInitial) } e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice) e.ID = id e.boundBindToDevice = btd e.route.Release() - e.route = stack.Route{} + e.route = nil e.dstPort = 0 return nil @@ -1104,7 +970,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { nicID := addr.NIC var localPort uint16 - switch e.state { + switch e.EndpointState() { case StateInitial: case StateBound, StateConnected: localPort = e.ID.LocalPort @@ -1139,7 +1005,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { RemoteAddress: r.RemoteAddress, } - if e.state == StateInitial { + if e.EndpointState() == StateInitial { id.LocalAddress = r.LocalAddress } @@ -1147,7 +1013,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // packets on a different network protocol, so we register both even if // v6only is set to false and this is an ipv6 endpoint. netProtos := []tcpip.NetworkProtocolNumber{netProto} - if netProto == header.IPv6ProtocolNumber && !e.v6only { + if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv4ProtocolNumber, header.IPv6ProtocolNumber, @@ -1173,7 +1039,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { e.RegisterNICID = nicID e.effectiveNetProtos = netProtos - e.state = StateConnected + e.setEndpointState(StateConnected) e.rcvMu.Lock() e.rcvReady = true @@ -1195,7 +1061,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { // A socket in the bound state can still receive multicast messages, // so we need to notify waiters on shutdown. - if e.state != StateBound && e.state != StateConnected { + if state := e.EndpointState(); state != StateBound && state != StateConnected { return tcpip.ErrNotConnected } @@ -1246,7 +1112,7 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. - if e.state != StateInitial { + if e.EndpointState() != StateInitial { return tcpip.ErrInvalidEndpointState } @@ -1259,7 +1125,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { // wildcard (empty) address, and this is an IPv6 endpoint with v6only // set to false. netProtos := []tcpip.NetworkProtocolNumber{netProto} - if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" { + if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && addr.Addr == "" { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv6ProtocolNumber, header.IPv4ProtocolNumber, @@ -1267,7 +1133,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { } nicID := addr.NIC - if len(addr.Addr) != 0 && !isBroadcastOrMulticast(addr.Addr) { + if len(addr.Addr) != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) { // A local unicast address was specified, verify that it's valid. nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) if nicID == 0 { @@ -1290,7 +1156,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { e.effectiveNetProtos = netProtos // Mark endpoint as bound. - e.state = StateBound + e.setEndpointState(StateBound) e.rcvMu.Lock() e.rcvReady = true @@ -1322,7 +1188,7 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { defer e.mu.RUnlock() addr := e.ID.LocalAddress - if e.state == StateConnected { + if e.EndpointState() == StateConnected { addr = e.route.LocalAddress } @@ -1338,7 +1204,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() - if e.state != StateConnected { + if e.EndpointState() != StateConnected { return tcpip.FullAddress{}, tcpip.ErrNotConnected } @@ -1393,7 +1259,6 @@ func verifyChecksum(hdr header.UDP, pkt *stack.PacketBuffer) bool { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) { - // Get the header then trim it from the view. hdr := header.UDP(pkt.TransportHeader().View()) if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize { // Malformed packet. @@ -1402,6 +1267,10 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB return } + // TODO(gvisor.dev/issues/5033): We should mirror the Network layer and cap + // packets at "Parse" instead of when handling a packet. + pkt.Data.CapLength(int(hdr.PayloadLength())) + if !verifyChecksum(hdr, pkt) { // Checksum Error. e.stack.Stats().UDP.ChecksumErrors.Increment() @@ -1435,7 +1304,12 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB senderAddress: tcpip.FullAddress{ NIC: pkt.NICID, Addr: id.RemoteAddress, - Port: header.UDP(hdr).SourcePort(), + Port: hdr.SourcePort(), + }, + destinationAddress: tcpip.FullAddress{ + NIC: pkt.NICID, + Addr: id.LocalAddress, + Port: header.UDP(hdr).DestinationPort(), }, } packet.data = pkt.Data @@ -1470,25 +1344,20 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { if typ == stack.ControlPortUnreachable { - e.mu.RLock() - if e.state == StateConnected { + if e.EndpointState() == StateConnected { e.lastErrorMu.Lock() e.lastError = tcpip.ErrConnectionRefused e.lastErrorMu.Unlock() - e.mu.RUnlock() e.waiterQueue.Notify(waiter.EventErr) return } - e.mu.RUnlock() } } // State implements tcpip.Endpoint.State. func (e *endpoint) State() uint32 { - e.mu.Lock() - defer e.mu.Unlock() - return uint32(e.state) + return uint32(e.EndpointState()) } // Info returns a copy of the endpoint info. @@ -1508,14 +1377,16 @@ func (e *endpoint) Stats() tcpip.EndpointStats { // Wait implements tcpip.Endpoint.Wait. func (*endpoint) Wait() {} -func isBroadcastOrMulticast(a tcpip.Address) bool { - return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a) +func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { + return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr) } +// SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } +// SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 858c99a45..13b72dc88 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -98,7 +98,8 @@ func (e *endpoint) Resume(s *stack.Stack) { } } - if e.state != StateBound && e.state != StateConnected { + state := e.EndpointState() + if state != StateBound && state != StateConnected { return } @@ -113,12 +114,12 @@ func (e *endpoint) Resume(s *stack.Stack) { } var err *tcpip.Error - if e.state == StateConnected { - e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.multicastLoop) + if state == StateConnected { + e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.ops.GetMulticastLoop()) if err != nil { panic(err) } - } else if len(e.ID.LocalAddress) != 0 && !isBroadcastOrMulticast(e.ID.LocalAddress) { // stateBound + } else if len(e.ID.LocalAddress) != 0 && !e.isBroadcastOrMulticast(e.RegisterNICID, netProto, e.ID.LocalAddress) { // stateBound // A local unicast address is specified, verify that it's valid. if e.stack.CheckLocalAddress(e.RegisterNICID, netProto, e.ID.LocalAddress) == 0 { panic(tcpip.ErrBadLocalAddress) diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 764ad0857..08980c298 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -22,6 +22,7 @@ import ( "testing" "time" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/checker" @@ -32,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) @@ -54,6 +56,7 @@ const ( stackPort = 1234 testAddr = "\x0a\x00\x00\x02" testPort = 4096 + invalidPort = 8192 multicastAddr = "\xe8\x2b\xd3\xea" multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" broadcastAddr = header.IPv4Broadcast @@ -295,7 +298,8 @@ func newDualTestContext(t *testing.T, mtu uint32) *testContext { t.Helper() return newDualTestContextWithOptions(t, mtu, stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, - TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, icmp.NewProtocol6, icmp.NewProtocol4}, + HandleLocal: true, }) } @@ -360,9 +364,7 @@ func (c *testContext) createEndpointForFlow(flow testFlow) { c.createEndpoint(flow.sockProto()) if flow.isV6Only() { - if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil { - c.t.Fatalf("SetSockOptBool failed: %s", err) - } + c.ep.SocketOptions().SetV6Only(true) } else if flow.isBroadcast() { c.ep.SocketOptions().SetBroadcast(true) } @@ -451,12 +453,12 @@ func (c *testContext) buildV6Packet(payload []byte, h *header4Tuple) buffer.View // Initialize the IP header. ip := header.IPv6(buf) ip.Encode(&header.IPv6Fields{ - TrafficClass: testTOS, - PayloadLength: uint16(header.UDPMinimumSize + len(payload)), - NextHeader: uint8(udp.ProtocolNumber), - HopLimit: 65, - SrcAddr: h.srcAddr.Addr, - DstAddr: h.dstAddr.Addr, + TrafficClass: testTOS, + PayloadLength: uint16(header.UDPMinimumSize + len(payload)), + TransportProtocol: udp.ProtocolNumber, + HopLimit: 65, + SrcAddr: h.srcAddr.Addr, + DstAddr: h.dstAddr.Addr, }) // Initialize the UDP header. @@ -972,7 +974,7 @@ func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) { // provided. func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 { c.t.Helper() - return testWriteInternal(c, flow, true, checkers...) + return testWriteAndVerifyInternal(c, flow, true, checkers...) } // testWriteWithoutDestination sends a packet of the given test flow from the @@ -981,10 +983,10 @@ func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker // checker functions provided. func testWriteWithoutDestination(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 { c.t.Helper() - return testWriteInternal(c, flow, false, checkers...) + return testWriteAndVerifyInternal(c, flow, false, checkers...) } -func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 { +func testWriteNoVerify(c *testContext, flow testFlow, setDest bool) buffer.View { c.t.Helper() // Take a snapshot of the stats to validate them at the end of the test. epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone() @@ -1006,6 +1008,12 @@ func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ... c.t.Fatalf("Bad number of bytes written: got %v, want %v", n, len(payload)) } c.checkEndpointWriteStats(1, epstats, err) + return payload +} + +func testWriteAndVerifyInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 { + c.t.Helper() + payload := testWriteNoVerify(c, flow, setDest) // Received the packet and check the payload. b := c.getPacketAndVerify(flow, checkers...) var udp header.UDP @@ -1150,6 +1158,39 @@ func TestV4WriteOnConnected(t *testing.T) { testWriteWithoutDestination(c, unicastV4) } +func TestWriteOnConnectedInvalidPort(t *testing.T) { + protocols := map[string]tcpip.NetworkProtocolNumber{ + "ipv4": ipv4.ProtocolNumber, + "ipv6": ipv6.ProtocolNumber, + } + for name, pn := range protocols { + t.Run(name, func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpoint(pn) + if err := c.ep.Connect(tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}); err != nil { + c.t.Fatalf("Connect failed: %s", err) + } + writeOpts := tcpip.WriteOptions{ + To: &tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}, + } + payload := buffer.View(newPayload()) + n, _, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts) + if err != nil { + c.t.Fatalf("c.ep.Write(...) = %+s, want nil", err) + } + if got, want := n, int64(len(payload)); got != want { + c.t.Fatalf("c.ep.Write(...) wrote %d bytes, want %d bytes", got, want) + } + + if err := c.ep.LastError(); err != tcpip.ErrConnectionRefused { + c.t.Fatalf("expected c.ep.LastError() == ErrConnectionRefused, got: %+v", err) + } + }) + } +} + // TestWriteOnBoundToV4Multicast checks that we can send packets out of a socket // that is bound to a V4 multicast address. func TestWriteOnBoundToV4Multicast(t *testing.T) { @@ -1372,9 +1413,7 @@ func TestReadIPPacketInfo(t *testing.T) { } } - if err := c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true); err != nil { - t.Fatalf("c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true): %s", err) - } + c.ep.SocketOptions().SetReceivePacketInfo(true) testRead(c, test.flow, checker.ReceiveIPPacketInfo(tcpip.IPPacketInfo{ NIC: 1, @@ -1389,6 +1428,93 @@ func TestReadIPPacketInfo(t *testing.T) { } } +func TestReadRecvOriginalDstAddr(t *testing.T) { + tests := []struct { + name string + proto tcpip.NetworkProtocolNumber + flow testFlow + expectedOriginalDstAddr tcpip.FullAddress + }{ + { + name: "IPv4 unicast", + proto: header.IPv4ProtocolNumber, + flow: unicastV4, + expectedOriginalDstAddr: tcpip.FullAddress{1, stackAddr, stackPort}, + }, + { + name: "IPv4 multicast", + proto: header.IPv4ProtocolNumber, + flow: multicastV4, + // This should actually be a unicast address assigned to the interface. + // + // TODO(gvisor.dev/issue/3556): This check is validating incorrect + // behaviour. We still include the test so that once the bug is + // resolved, this test will start to fail and the individual tasked + // with fixing this bug knows to also fix this test :). + expectedOriginalDstAddr: tcpip.FullAddress{1, multicastAddr, stackPort}, + }, + { + name: "IPv4 broadcast", + proto: header.IPv4ProtocolNumber, + flow: broadcast, + // This should actually be a unicast address assigned to the interface. + // + // TODO(gvisor.dev/issue/3556): This check is validating incorrect + // behaviour. We still include the test so that once the bug is + // resolved, this test will start to fail and the individual tasked + // with fixing this bug knows to also fix this test :). + expectedOriginalDstAddr: tcpip.FullAddress{1, broadcastAddr, stackPort}, + }, + { + name: "IPv6 unicast", + proto: header.IPv6ProtocolNumber, + flow: unicastV6, + expectedOriginalDstAddr: tcpip.FullAddress{1, stackV6Addr, stackPort}, + }, + { + name: "IPv6 multicast", + proto: header.IPv6ProtocolNumber, + flow: multicastV6, + // This should actually be a unicast address assigned to the interface. + // + // TODO(gvisor.dev/issue/3556): This check is validating incorrect + // behaviour. We still include the test so that once the bug is + // resolved, this test will start to fail and the individual tasked + // with fixing this bug knows to also fix this test :). + expectedOriginalDstAddr: tcpip.FullAddress{1, multicastV6Addr, stackPort}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpoint(test.proto) + + bindAddr := tcpip.FullAddress{Port: stackPort} + if err := c.ep.Bind(bindAddr); err != nil { + t.Fatalf("Bind(%#v): %s", bindAddr, err) + } + + if test.flow.isMulticast() { + ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: test.flow.getMcastAddr()} + if err := c.ep.SetSockOpt(&ifoptSet); err != nil { + c.t.Fatalf("SetSockOpt(&%#v): %s:", ifoptSet, err) + } + } + + c.ep.SocketOptions().SetReceiveOriginalDstAddress(true) + + testRead(c, test.flow, checker.ReceiveOriginalDstAddr(test.expectedOriginalDstAddr)) + + if got := c.s.Stats().UDP.PacketsReceived.Value(); got != 1 { + t.Fatalf("Read did not increment PacketsReceived: got = %d, want = 1", got) + } + }) + } +} + func TestWriteIncrementsPacketsSent(t *testing.T) { c := newDualTestContext(t, defaultMTU) defer c.cleanup() @@ -1412,16 +1538,12 @@ func TestNoChecksum(t *testing.T) { c.createEndpointForFlow(flow) // Disable the checksum generation. - if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, true); err != nil { - t.Fatalf("SetSockOptBool failed: %s", err) - } + c.ep.SocketOptions().SetNoChecksum(true) // This option is effective on IPv4 only. testWrite(c, flow, checker.UDP(checker.NoChecksum(flow.isV4()))) // Enable the checksum generation. - if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, false); err != nil { - t.Fatalf("SetSockOptBool failed: %s", err) - } + c.ep.SocketOptions().SetNoChecksum(false) testWrite(c, flow, checker.UDP(checker.NoChecksum(false))) }) } @@ -1591,13 +1713,15 @@ func TestSetTClass(t *testing.T) { } func TestReceiveTosTClass(t *testing.T) { + const RcvTOSOpt = "ReceiveTosOption" + const RcvTClassOpt = "ReceiveTClassOption" + testCases := []struct { - name string - getReceiveOption tcpip.SockOptBool - tests []testFlow + name string + tests []testFlow }{ - {"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}}, - {"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}}, + {RcvTOSOpt, []testFlow{unicastV4, broadcast}}, + {RcvTClassOpt, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}}, } for _, testCase := range testCases { for _, flow := range testCase.tests { @@ -1606,29 +1730,32 @@ func TestReceiveTosTClass(t *testing.T) { defer c.cleanup() c.createEndpointForFlow(flow) - option := testCase.getReceiveOption name := testCase.name - // Verify that setting and reading the option works. - v, err := c.ep.GetSockOptBool(option) - if err != nil { - c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err) + var optionGetter func() bool + var optionSetter func(bool) + switch name { + case RcvTOSOpt: + optionGetter = c.ep.SocketOptions().GetReceiveTOS + optionSetter = c.ep.SocketOptions().SetReceiveTOS + case RcvTClassOpt: + optionGetter = c.ep.SocketOptions().GetReceiveTClass + optionSetter = c.ep.SocketOptions().SetReceiveTClass + default: + t.Fatalf("unkown test variant: %s", name) } + + // Verify that setting and reading the option works. + v := optionGetter() // Test for expected default value. if v != false { c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false) } want := true - if err := c.ep.SetSockOptBool(option, want); err != nil { - c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err) - } - - got, err := c.ep.GetSockOptBool(option) - if err != nil { - c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err) - } + optionSetter(want) + got := optionGetter() if got != want { c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want) } @@ -1638,10 +1765,10 @@ func TestReceiveTosTClass(t *testing.T) { if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %s", err) } - switch option { - case tcpip.ReceiveTClassOption: + switch name { + case RcvTClassOpt: testRead(c, flow, checker.ReceiveTClass(testTOS)) - case tcpip.ReceiveTOSOption: + case RcvTOSOpt: testRead(c, flow, checker.ReceiveTOS(testTOS)) default: t.Fatalf("unknown test variant: %s", name) @@ -1788,27 +1915,31 @@ func TestV4UnknownDestination(t *testing.T) { icmpPkt := header.ICMPv4(hdr.Payload()) payloadIPHeader := header.IPv4(icmpPkt.Payload()) incomingHeaderLength := header.IPv4MinimumSize + header.UDPMinimumSize - wantLen := len(payload) + wantPayloadLen := len(payload) if tc.largePayload { // To work out the data size we need to simulate what the sender would // have done. The wanted size is the total available minus the sum of // the headers in the UDP AND ICMP packets, given that we know the test // had only a minimal IP header but the ICMP sender will have allowed // for a maximally sized packet header. - wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MaximumHeaderSize - header.ICMPv4MinimumSize - incomingHeaderLength + wantPayloadLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MaximumHeaderSize - header.ICMPv4MinimumSize - incomingHeaderLength } // In the case of large payloads the IP packet may be truncated. Update // the length field before retrieving the udp datagram payload. // Add back the two headers within the payload. - payloadIPHeader.SetTotalLength(uint16(wantLen + incomingHeaderLength)) - + payloadIPHeader.SetTotalLength(uint16(wantPayloadLen + incomingHeaderLength)) origDgram := header.UDP(payloadIPHeader.Payload()) - if got, want := len(origDgram.Payload()), wantLen; got != want { - t.Fatalf("unexpected payload length got: %d, want: %d", got, want) + wantDgramLen := wantPayloadLen + header.UDPMinimumSize + + if got, want := len(origDgram), wantDgramLen; got != want { + t.Fatalf("got len(origDgram) = %d, want = %d", got, want) } - if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) { - t.Fatalf("unexpected payload got: %d, want: %d", got, want) + // Correct UDP length to access payload. + origDgram.SetLength(uint16(wantDgramLen)) + + if got, want := origDgram.Payload(), payload[:wantPayloadLen]; !bytes.Equal(got, want) { + t.Fatalf("got origDgram.Payload() = %x, want = %x", got, want) } }) } @@ -1883,20 +2014,23 @@ func TestV6UnknownDestination(t *testing.T) { icmpPkt := header.ICMPv6(hdr.Payload()) payloadIPHeader := header.IPv6(icmpPkt.Payload()) - wantLen := len(payload) + wantPayloadLen := len(payload) if tc.largePayload { - wantLen = header.IPv6MinimumMTU - header.IPv6MinimumSize*2 - header.ICMPv6MinimumSize - header.UDPMinimumSize + wantPayloadLen = header.IPv6MinimumMTU - header.IPv6MinimumSize*2 - header.ICMPv6MinimumSize - header.UDPMinimumSize } + wantDgramLen := wantPayloadLen + header.UDPMinimumSize // In case of large payloads the IP packet may be truncated. Update // the length field before retrieving the udp datagram payload. - payloadIPHeader.SetPayloadLength(uint16(wantLen + header.UDPMinimumSize)) + payloadIPHeader.SetPayloadLength(uint16(wantDgramLen)) origDgram := header.UDP(payloadIPHeader.Payload()) - if got, want := len(origDgram.Payload()), wantLen; got != want { - t.Fatalf("unexpected payload length got: %d, want: %d", got, want) + if got, want := len(origDgram), wantPayloadLen+header.UDPMinimumSize; got != want { + t.Fatalf("got len(origDgram) = %d, want = %d", got, want) } - if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) { - t.Fatalf("unexpected payload got: %v, want: %v", got, want) + // Correct UDP length to access payload. + origDgram.SetLength(uint16(wantPayloadLen + header.UDPMinimumSize)) + if diff := cmp.Diff(payload[:wantPayloadLen], origDgram.Payload()); diff != "" { + t.Fatalf("origDgram.Payload() mismatch (-want +got):\n%s", diff) } }) } @@ -1955,12 +2089,12 @@ func TestShortHeader(t *testing.T) { // Initialize the IP header. ip := header.IPv6(buf) ip.Encode(&header.IPv6Fields{ - TrafficClass: testTOS, - PayloadLength: uint16(udpSize), - NextHeader: uint8(udp.ProtocolNumber), - HopLimit: 65, - SrcAddr: h.srcAddr.Addr, - DstAddr: h.dstAddr.Addr, + TrafficClass: testTOS, + PayloadLength: uint16(udpSize), + TransportProtocol: udp.ProtocolNumber, + HopLimit: 65, + SrcAddr: h.srcAddr.Addr, + DstAddr: h.dstAddr.Addr, }) // Initialize the UDP header. @@ -2409,3 +2543,67 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { }) } } + +func TestReceiveShortLength(t *testing.T) { + flows := []testFlow{unicastV4, unicastV6} + for _, flow := range flows { + t.Run(flow.String(), func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpointForFlow(flow) + + // Bind to wildcard. + bindAddr := tcpip.FullAddress{Port: stackPort} + if err := c.ep.Bind(bindAddr); err != nil { + c.t.Fatalf("c.ep.Bind(%#v): %s", bindAddr, err) + } + + payload := newPayload() + extraBytes := []byte{1, 2, 3, 4} + h := flow.header4Tuple(incoming) + var buf buffer.View + var proto tcpip.NetworkProtocolNumber + + // Build packets with extra bytes not accounted for in the UDP length + // field. + var udp header.UDP + if flow.isV4() { + buf = c.buildV4Packet(payload, &h) + buf = append(buf, extraBytes...) + ip := header.IPv4(buf) + ip.SetTotalLength(ip.TotalLength() + uint16(len(extraBytes))) + ip.SetChecksum(0) + ip.SetChecksum(^ip.CalculateChecksum()) + proto = ipv4.ProtocolNumber + udp = ip.Payload() + } else { + buf = c.buildV6Packet(payload, &h) + buf = append(buf, extraBytes...) + ip := header.IPv6(buf) + ip.SetPayloadLength(ip.PayloadLength() + uint16(len(extraBytes))) + proto = ipv6.ProtocolNumber + udp = ip.Payload() + } + + if diff := cmp.Diff(payload, udp.Payload()); diff != "" { + t.Errorf("udp.Payload() mismatch (-want +got):\n%s", diff) + } + + c.linkEP.InjectInbound(proto, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) + + // Try to receive the data. + v, _, err := c.ep.Read(nil) + if err != nil { + t.Fatalf("c.ep.Read(nil): %s", err) + } + + // Check the payload is read back without extra bytes. + if diff := cmp.Diff(buffer.View(payload), v); diff != "" { + t.Errorf("c.ep.Read(nil) mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/test/criutil/criutil.go b/pkg/test/criutil/criutil.go index 70945f234..e41769017 100644 --- a/pkg/test/criutil/criutil.go +++ b/pkg/test/criutil/criutil.go @@ -54,14 +54,20 @@ func ResolvePath(executable string) string { } } + // Favor /usr/local/bin, if it exists. + localBin := fmt.Sprintf("/usr/local/bin/%s", executable) + if _, err := os.Stat(localBin); err == nil { + return localBin + } + // Try to find via the path. - guess, err := exec.LookPath(executable) + guess, _ := exec.LookPath(executable) if err == nil { return guess } - // Return a default path. - return fmt.Sprintf("/usr/local/bin/%s", executable) + // Return a bare path; this generates a suitable error. + return executable } // NewCrictl returns a Crictl configured with a timeout and an endpoint over diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go index 64d17f661..2bf0a22ff 100644 --- a/pkg/test/dockerutil/container.go +++ b/pkg/test/dockerutil/container.go @@ -17,6 +17,7 @@ package dockerutil import ( "bytes" "context" + "errors" "fmt" "io/ioutil" "net" @@ -351,6 +352,9 @@ func (c *Container) SandboxPid(ctx context.Context) (int, error) { return resp.ContainerJSONBase.State.Pid, nil } +// ErrNoIP indicates that no IP address is available. +var ErrNoIP = errors.New("no IP available") + // FindIP returns the IP address of the container. func (c *Container) FindIP(ctx context.Context, ipv6 bool) (net.IP, error) { resp, err := c.client.ContainerInspect(ctx, c.id) @@ -365,7 +369,7 @@ func (c *Container) FindIP(ctx context.Context, ipv6 bool) (net.IP, error) { ip = net.ParseIP(resp.NetworkSettings.DefaultNetworkSettings.IPAddress) } if ip == nil { - return net.IP{}, fmt.Errorf("invalid IP: %q", ip) + return net.IP{}, ErrNoIP } return ip, nil } diff --git a/pkg/test/dockerutil/exec.go b/pkg/test/dockerutil/exec.go index 4c739c9e9..bf968acec 100644 --- a/pkg/test/dockerutil/exec.go +++ b/pkg/test/dockerutil/exec.go @@ -77,11 +77,6 @@ func (c *Container) doExec(ctx context.Context, r ExecOpts, args []string) (Proc return Process{}, fmt.Errorf("exec attach failed with err: %v", err) } - if err := c.client.ContainerExecStart(ctx, resp.ID, types.ExecStartCheck{}); err != nil { - hijack.Close() - return Process{}, fmt.Errorf("exec start failed with err: %v", err) - } - return Process{ container: c, execid: resp.ID, diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go index 49ab87c58..fdd416b5e 100644 --- a/pkg/test/testutil/testutil.go +++ b/pkg/test/testutil/testutil.go @@ -36,7 +36,6 @@ import ( "path/filepath" "strconv" "strings" - "sync/atomic" "syscall" "testing" "time" @@ -49,7 +48,10 @@ import ( ) var ( - checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support") + checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support") + partition = flag.Int("partition", 1, "partition number, this is 1-indexed") + totalPartitions = flag.Int("total_partitions", 1, "total number of partitions") + isRunningWithHostNet = flag.Bool("hostnet", false, "whether test is running with hostnet") ) // IsCheckpointSupported returns the relevant command line flag. @@ -57,6 +59,11 @@ func IsCheckpointSupported() bool { return *checkpoint } +// IsRunningWithHostNet returns the relevant command line flag. +func IsRunningWithHostNet() bool { + return *isRunningWithHostNet +} + // ImageByName mangles the image name used locally. This depends on the image // build infrastructure in images/ and tools/vm. func ImageByName(name string) string { @@ -249,14 +256,25 @@ func writeSpec(dir string, spec *specs.Spec) error { // idRandomSrc is a pseudo random generator used to in RandomID. var idRandomSrc = rand.New(rand.NewSource(time.Now().UnixNano())) +// idRandomSrcMtx is the mutex protecting idRandomSrc.Read from being used +// concurrently in differnt goroutines. +var idRandomSrcMtx sync.Mutex + // RandomID returns 20 random bytes following the given prefix. func RandomID(prefix string) string { // Read 20 random bytes. b := make([]byte, 20) + // Rand.Read is not safe for concurrent use. Packetimpact tests can be run in + // parallel now, so we have to protect the Read with a mutex. Otherwise we'll + // run into name conflicts. + // https://golang.org/pkg/math/rand/#Rand.Read + idRandomSrcMtx.Lock() // "[Read] always returns len(p) and a nil error." --godoc if _, err := idRandomSrc.Read(b); err != nil { + idRandomSrcMtx.Unlock() panic("rand.Read failed: " + err.Error()) } + idRandomSrcMtx.Unlock() if prefix != "" { prefix = prefix + "-" } @@ -417,33 +435,35 @@ func StartReaper() func() { // WaitUntilRead reads from the given reader until the wanted string is found // or until timeout. -func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error { +func WaitUntilRead(r io.Reader, want string, timeout time.Duration) error { sc := bufio.NewScanner(r) - if split != nil { - sc.Split(split) - } // done must be accessed atomically. A value greater than 0 indicates // that the read loop can exit. - var done uint32 - doneCh := make(chan struct{}) + doneCh := make(chan bool) + defer close(doneCh) go func() { for sc.Scan() { t := sc.Text() if strings.Contains(t, want) { - atomic.StoreUint32(&done, 1) - close(doneCh) - break + doneCh <- true + return } - if atomic.LoadUint32(&done) > 0 { - break + select { + case <-doneCh: + return + default: } } + doneCh <- false }() + select { case <-time.After(timeout): - atomic.StoreUint32(&done, 1) return fmt.Errorf("timeout waiting to read %q", want) - case <-doneCh: + case res := <-doneCh: + if !res { + return fmt.Errorf("reader closed while waiting to read %q", want) + } return nil } } @@ -509,7 +529,8 @@ func TouchShardStatusFile() error { } // TestIndicesForShard returns indices for this test shard based on the -// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. +// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars, as well as +// the passed partition flags. // // If either of the env vars are not present, then the function will return all // tests. If there are more shards than there are tests, then the returned list @@ -534,6 +555,11 @@ func TestIndicesForShard(numTests int) ([]int, error) { } } + // Combine with the partitions. + partitionSize := shardTotal + shardTotal = (*totalPartitions) * shardTotal + shardIndex = partitionSize*(*partition-1) + shardIndex + // Calculate! var indices []int numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal))) diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index 9b1e7a085..79db8895b 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -167,7 +167,7 @@ func (rw *IOReadWriter) Read(dst []byte) (int, error) { return n, err } -// Writer implements io.Writer.Write. +// Write implements io.Writer.Write. func (rw *IOReadWriter) Write(src []byte) (int, error) { n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts) end, ok := rw.Addr.AddLength(uint64(n)) diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 08519d986..83d4f893a 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -119,7 +119,10 @@ type EntryCallback interface { // The callback is supposed to perform minimal work, and cannot call // any method on the queue itself because it will be locked while the // callback is running. - Callback(e *Entry) + // + // The mask indicates the events that occurred and that the entry is + // interested in. + Callback(e *Entry, mask EventMask) } // Entry represents a waiter that can be add to the a wait queue. It can @@ -140,7 +143,7 @@ type channelCallback struct { } // Callback implements EntryCallback.Callback. -func (c *channelCallback) Callback(*Entry) { +func (c *channelCallback) Callback(*Entry, EventMask) { select { case c.ch <- struct{}{}: default: @@ -193,8 +196,8 @@ func (q *Queue) EventUnregister(e *Entry) { func (q *Queue) Notify(mask EventMask) { q.mu.RLock() for e := q.list.Front(); e != nil; e = e.Next() { - if mask&e.mask != 0 { - e.Callback.Callback(e) + if m := mask & e.mask; m != 0 { + e.Callback.Callback(e, m) } } q.mu.RUnlock() diff --git a/pkg/waiter/waiter_test.go b/pkg/waiter/waiter_test.go index c1b94a4f3..6928f28b4 100644 --- a/pkg/waiter/waiter_test.go +++ b/pkg/waiter/waiter_test.go @@ -20,12 +20,12 @@ import ( ) type callbackStub struct { - f func(e *Entry) + f func(e *Entry, m EventMask) } // Callback implements EntryCallback.Callback. -func (c *callbackStub) Callback(e *Entry) { - c.f(e) +func (c *callbackStub) Callback(e *Entry, m EventMask) { + c.f(e, m) } func TestEmptyQueue(t *testing.T) { @@ -36,7 +36,7 @@ func TestEmptyQueue(t *testing.T) { // Register then unregister a waiter, then notify the queue. cnt := 0 - e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}} + e := Entry{Callback: &callbackStub{func(*Entry, EventMask) { cnt++ }}} q.EventRegister(&e, EventIn) q.EventUnregister(&e) q.Notify(EventIn) @@ -49,7 +49,7 @@ func TestMask(t *testing.T) { // Register a waiter. var q Queue var cnt int - e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}} + e := Entry{Callback: &callbackStub{func(*Entry, EventMask) { cnt++ }}} q.EventRegister(&e, EventIn|EventErr) // Notify with an overlapping mask. @@ -101,11 +101,14 @@ func TestConcurrentRegistration(t *testing.T) { for i := 0; i < concurrency; i++ { go func() { var e Entry - e.Callback = &callbackStub{func(entry *Entry) { + e.Callback = &callbackStub{func(entry *Entry, mask EventMask) { cnt++ if entry != &e { t.Errorf("entry = %p, want %p", entry, &e) } + if mask != EventIn { + t.Errorf("mask = %#x want %#x", mask, EventIn) + } }} // Wait for notification, then register. @@ -158,11 +161,14 @@ func TestConcurrentNotification(t *testing.T) { // Register waiters. for i := 0; i < waiterCount; i++ { var e Entry - e.Callback = &callbackStub{func(entry *Entry) { + e.Callback = &callbackStub{func(entry *Entry, mask EventMask) { atomic.AddInt32(&cnt, 1) if entry != &e { t.Errorf("entry = %p, want %p", entry, &e) } + if mask != EventIn { + t.Errorf("mask = %#x want %#x", mask, EventIn) + } }} q.EventRegister(&e, EventIn|EventErr) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 7076ae2e2..a3a76b609 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -53,7 +53,7 @@ type compatEmitter struct { func newCompatEmitter(logFD int) (*compatEmitter, error) { nameMap, ok := getSyscallNameMap() if !ok { - return nil, fmt.Errorf("Linux syscall table not found") + return nil, fmt.Errorf("syscall table not found") } c := &compatEmitter{ diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fdf13c8e1..865126ac5 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -211,10 +211,31 @@ func (cm *containerManager) Processes(cid *string, out *[]*control.Process) erro return control.Processes(cm.l.k, *cid, out) } +// CreateArgs contains arguments to the Create method. +type CreateArgs struct { + // CID is the ID of the container to start. + CID string + + // FilePayload may contain a TTY file for the terminal, if enabled. + urpc.FilePayload +} + // Create creates a container within a sandbox. -func (cm *containerManager) Create(cid *string, _ *struct{}) error { - log.Debugf("containerManager.Create, cid: %s", *cid) - return cm.l.createContainer(*cid) +func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error { + log.Debugf("containerManager.Create: %s", args.CID) + + if len(args.Files) > 1 { + return fmt.Errorf("start arguments must have at most 1 files for TTY") + } + var tty *fd.FD + if len(args.Files) == 1 { + var err error + tty, err = fd.NewFromFile(args.Files[0]) + if err != nil { + return fmt.Errorf("error dup'ing TTY file: %w", err) + } + } + return cm.l.createContainer(args.CID, tty) } // StartArgs contains arguments to the Start method. @@ -229,9 +250,8 @@ type StartArgs struct { CID string // FilePayload contains, in order: - // * stdin, stdout, and stderr. - // * the file descriptor over which the sandbox will - // request files from its root filesystem. + // * stdin, stdout, and stderr (optional: if terminal is disabled). + // * file descriptors to connect to gofer to serve the root filesystem. urpc.FilePayload } @@ -251,23 +271,45 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } - if len(args.FilePayload.Files) < 4 { - return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + if len(args.Files) < 1 { + return fmt.Errorf("start arguments must contain at least one file for the container root gofer") } // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - fds, err := fd.NewFromFiles(args.FilePayload.Files) + goferFiles := args.Files + var stdios []*fd.FD + if !args.Spec.Process.Terminal { + // When not using a terminal, stdios come as the first 3 files in the + // payload. + if l := len(args.Files); l < 4 { + return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l) + } + var err error + stdios, err = fd.NewFromFiles(goferFiles[:3]) + if err != nil { + return fmt.Errorf("error dup'ing stdio files: %w", err) + } + goferFiles = goferFiles[3:] + } + defer func() { + for _, fd := range stdios { + _ = fd.Close() + } + }() + + goferFDs, err := fd.NewFromFiles(goferFiles) if err != nil { - return err + return fmt.Errorf("error dup'ing gofer files: %w", err) } defer func() { - for _, fd := range fds { + for _, fd := range goferFDs { _ = fd.Close() } }() - if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { + + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil { log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err) return err } @@ -330,18 +372,18 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") var specFile, deviceFile *os.File - switch numFiles := len(o.FilePayload.Files); numFiles { + switch numFiles := len(o.Files); numFiles { case 2: // The device file is donated to the platform. // Can't take ownership away from os.File. dup them to get a new FD. - fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) + fd, err := syscall.Dup(int(o.Files[1].Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } deviceFile = os.NewFile(uintptr(fd), "platform device") fallthrough case 1: - specFile = o.FilePayload.Files[0] + specFile = o.Files[0] case 0: return fmt.Errorf("at least one file must be passed to Restore") default: diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a7c4ebb0c..4e3bb9ac7 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -343,6 +343,16 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_PKTINFO), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IPV6), seccomp.EqualTo(syscall.IPV6_TCLASS), }, @@ -358,6 +368,11 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_SOCKET), seccomp.EqualTo(syscall.SO_ERROR), }, @@ -393,6 +408,11 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TIMESTAMP), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_NODELAY), }, @@ -401,6 +421,11 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_INFO), }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(linux.TCP_INQ), + }, }, syscall.SYS_IOCTL: []seccomp.Rule{ { @@ -449,6 +474,13 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TIMESTAMP), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_TCP), seccomp.EqualTo(syscall.TCP_NODELAY), seccomp.MatchAny{}, @@ -456,6 +488,13 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(linux.TCP_INQ), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IP), seccomp.EqualTo(syscall.IP_TOS), seccomp.MatchAny{}, @@ -470,6 +509,20 @@ func hostInetFilters() seccomp.SyscallRules { }, { seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_PKTINFO), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, + { + seccomp.MatchAny{}, seccomp.EqualTo(syscall.SOL_IPV6), seccomp.EqualTo(syscall.IPV6_TCLASS), seccomp.MatchAny{}, @@ -482,6 +535,13 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.MatchAny{}, seccomp.EqualTo(4), }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), + }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 6b6ae98d7..2b0d2cd51 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -22,15 +22,6 @@ import ( "strings" "syscall" - // Include filesystem types that OCI spec might mount. - _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" - _ "gvisor.dev/gvisor/pkg/sentry/fs/host" - _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" - _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" - "gvisor.dev/gvisor/pkg/sentry/vfs" - specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -48,9 +39,18 @@ import ( tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + _ "gvisor.dev/gvisor/pkg/sentry/fs/host" + _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" ) const ( diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ebdd518d0..3df013d34 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -157,6 +157,11 @@ type execProcess struct { // pidnsPath is the pid namespace path in spec pidnsPath string + + // hostTTY is present when creating a sub-container with terminal enabled. + // TTY file is passed during container create and must be saved until + // container start. + hostTTY *fd.FD } func init() { @@ -588,7 +593,9 @@ func (l *Loader) run() error { // Create the root container init task. It will begin running // when the kernel is started. - if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + var err error + _, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root) + if err != nil { return err } @@ -627,7 +634,7 @@ func (l *Loader) run() error { } // createContainer creates a new container inside the sandbox. -func (l *Loader) createContainer(cid string) error { +func (l *Loader) createContainer(cid string, tty *fd.FD) error { l.mu.Lock() defer l.mu.Unlock() @@ -635,14 +642,14 @@ func (l *Loader) createContainer(cid string) error { if _, ok := l.processes[eid]; ok { return fmt.Errorf("container %q already exists", cid) } - l.processes[eid] = &execProcess{} + l.processes[eid] = &execProcess{hostTTY: tty} return nil } // startContainer starts a child container. It returns the thread group ID of // the newly created process. Used FDs are either closed or released. It's safe // for the caller to close any remaining files upon return. -func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -695,36 +702,41 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin info := &containerInfo{ conf: conf, spec: spec, - stdioFDs: files[:3], - goferFDs: files[3:], + goferFDs: goferFDs, } info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } - tg, err := l.createContainerProcess(false, cid, info, ep) + + // Use stdios or TTY depending on the spec configuration. + if spec.Process.Terminal { + if len(stdioFDs) > 0 { + return fmt.Errorf("using TTY, stdios not expected: %v", stdioFDs) + } + if ep.hostTTY == nil { + return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?") + } + info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} + ep.hostTTY = nil + } else { + info.stdioFDs = stdioFDs + } + + ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info) if err != nil { return err } - - // Success! - l.k.StartProcess(tg) - ep.tg = tg + l.k.StartProcess(ep.tg) return nil } -func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { - console := false - if root { - // Only root container supports terminal for now. - console = info.spec.Process.Terminal - } - +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Create the FD map, which will set stdin, stdout, and stderr. ctx := info.procArgs.NewContext(l.k) - fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs) if err != nil { - return nil, fmt.Errorf("importing fds: %v", err) + return nil, nil, nil, fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on fdTable if successful. We won't need // ours either way. @@ -736,11 +748,11 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) if root { if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { - return nil, err + return nil, nil, nil, err } } if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { - return nil, err + return nil, nil, nil, err } // Add the HOME environment variable if it is not already set. @@ -754,29 +766,25 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } if err != nil { - return nil, err + return nil, nil, nil, err } info.procArgs.Envv = envv // Create and start the new process. tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return nil, fmt.Errorf("creating process: %v", err) + return nil, nil, nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. - if root { - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) - } + switch { + case ttyFileVFS2 != nil: + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) } // Install seccomp filters with the new task if there are any. @@ -784,7 +792,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) if err != nil { - return nil, fmt.Errorf("building seccomp program: %v", err) + return nil, nil, nil, fmt.Errorf("building seccomp program: %v", err) } if log.IsLogging(log.Debug) { @@ -795,7 +803,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn task := tg.Leader() // NOTE: It seems Flags are ignored by runc so we ignore them too. if err := task.AppendSyscallFilter(program, true); err != nil { - return nil, fmt.Errorf("appending seccomp filters: %v", err) + return nil, nil, nil, fmt.Errorf("appending seccomp filters: %v", err) } } } else { @@ -804,7 +812,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } } - return tg, nil + return tg, ttyFile, ttyFileVFS2, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on @@ -1074,7 +1082,12 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} - transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} + transProtos := []stack.TransportProtocolFactory{ + tcp.NewProtocol, + udp.NewProtocol, + icmp.NewProtocol4, + icmp.NewProtocol6, + } s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, diff --git a/runsc/boot/network.go b/runsc/boot/network.go index f58b09942..3d3a813df 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -40,9 +40,9 @@ var ( // "::1/8" on "lo" interface. DefaultLoopbackLink = LoopbackLink{ Name: "lo", - Addresses: []net.IP{ - net.IP("\x7f\x00\x00\x01"), - net.IPv6loopback, + Addresses: []IPWithPrefix{ + {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8}, + {Address: net.IPv6loopback, PrefixLen: 128}, }, Routes: []Route{ { @@ -82,7 +82,7 @@ type DefaultRoute struct { type FDBasedLink struct { Name string MTU int - Addresses []net.IP + Addresses []IPWithPrefix Routes []Route GSOMaxSize uint32 SoftwareGSOEnabled bool @@ -99,7 +99,7 @@ type FDBasedLink struct { // LoopbackLink configures a loopback li nk. type LoopbackLink struct { Name string - Addresses []net.IP + Addresses []IPWithPrefix Routes []Route } @@ -117,6 +117,19 @@ type CreateLinksAndRoutesArgs struct { Defaultv6Gateway DefaultRoute } +// IPWithPrefix is an address with its subnet prefix length. +type IPWithPrefix struct { + // Address is a network address. + Address net.IP + + // PrefixLen is the subnet prefix length. + PrefixLen int +} + +func (ip IPWithPrefix) String() string { + return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen) +} + // Empty returns true if route hasn't been set. func (r *Route) Empty() bool { return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil @@ -264,15 +277,19 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. -func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error { +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []IPWithPrefix) error { opts := stack.NICOptions{Name: name} if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil { return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) } for _, addr := range addrs { - proto, tcpipAddr := ipToAddressAndProto(addr) - if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil { + proto, tcpipAddr := ipToAddressAndProto(addr.Address) + ap := tcpip.AddressWithPrefix{ + Address: tcpipAddr, + PrefixLen: addr.PrefixLen, + } + if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil { return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) } } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index b157387ef..3fd28e516 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -250,36 +250,76 @@ func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Cre overlayOpts := *lowerOpts overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} - // Next mount upper and lower. Upper is a tmpfs mount to keep all - // modifications inside the sandbox. - upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) - if err != nil { - return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) - } - cu := cleanup.Make(func() { upper.DecRef(ctx) }) - defer cu.Clean() - // All writes go to the upper layer, be paranoid and make lower readonly. lowerOpts.ReadOnly = true lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) if err != nil { return nil, nil, err } - cu.Add(func() { lower.DecRef(ctx) }) + cu := cleanup.Make(func() { lower.DecRef(ctx) }) + defer cu.Clean() - // Propagate the lower layer's root's owner, group, and mode to the upper - // layer's root for consistency with VFS1. - upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + // Determine the lower layer's root's type. lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root()) stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{ Root: lowerRootVD, Start: lowerRootVD, }, &vfs.StatOptions{ - Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE, + Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE, }) if err != nil { - return nil, nil, err + return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err) + } + if stat.Mask&linux.STATX_TYPE == 0 { + return nil, nil, fmt.Errorf("failed to get file type of lower layer's root") + } + rootType := stat.Mode & linux.S_IFMT + if rootType != linux.S_IFDIR && rootType != linux.S_IFREG { + return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType) + } + + // Upper is a tmpfs mount to keep all modifications inside the sandbox. + upperOpts.GetFilesystemOptions.InternalData = tmpfs.FilesystemOpts{ + RootFileType: uint16(rootType), + } + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu.Add(func() { upper.DecRef(ctx) }) + + // If the overlay mount consists of a regular file, copy up its contents + // from the lower layer, since in the overlay the otherwise-empty upper + // layer file will take precedence. + upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + if rootType == linux.S_IFREG { + lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRootVD, + Start: lowerRootVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err) + } + defer lowerFD.DecRef(ctx) + upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ + Root: upperRootVD, + Start: upperRootVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY, + }) + if err != nil { + return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err) + } + defer upperFD.DecRef(ctx) + if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil { + return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err) + } } + + // Propagate the lower layer's root's owner, group, and mode to the upper + // layer's root for consistency with VFS1. err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{ Root: upperRootVD, Start: upperRootVD, diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 5bd0afc52..e5294de55 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -234,7 +234,7 @@ func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { type Cgroup struct { Name string `json:"name"` Parents map[string]string `json:"parents"` - Own bool `json:"own"` + Own map[string]bool `json:"own"` } // New creates a new Cgroup instance if the spec includes a cgroup path. @@ -251,9 +251,11 @@ func New(spec *specs.Spec) (*Cgroup, error) { return nil, fmt.Errorf("finding current cgroups: %w", err) } } + own := make(map[string]bool) return &Cgroup{ Name: spec.Linux.CgroupsPath, Parents: parents, + Own: own, }, nil } @@ -261,18 +263,8 @@ func New(spec *specs.Spec) (*Cgroup, error) { // already exists, it means that the caller has already provided a // pre-configured cgroups, and 'res' is ignored. func (c *Cgroup) Install(res *specs.LinuxResources) error { - if _, err := os.Stat(c.makePath("memory")); err == nil { - // If cgroup has already been created; it has been setup by caller. Don't - // make any changes to configuration, just join when sandbox/gofer starts. - log.Debugf("Using pre-created cgroup %q", c.Name) - return nil - } - log.Debugf("Creating cgroup %q", c.Name) - // Mark that cgroup resources are owned by me. - c.Own = true - // The Cleanup object cleans up partially created cgroups when an error occurs. // Errors occuring during cleanup itself are ignored. clean := cleanup.Make(func() { _ = c.Uninstall() }) @@ -280,6 +272,16 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { for key, cfg := range controllers { path := c.makePath(key) + if _, err := os.Stat(path); err == nil { + // If cgroup has already been created; it has been setup by caller. Don't + // make any changes to configuration, just join when sandbox/gofer starts. + log.Debugf("Using pre-created cgroup %q", path) + continue + } + + // Mark that cgroup resources are owned by me. + c.Own[key] = true + if err := os.MkdirAll(path, 0755); err != nil { if cfg.optional && errors.Is(err, syscall.EROFS) { log.Infof("Skipping cgroup %q", key) @@ -298,12 +300,12 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { // Uninstall removes the settings done in Install(). If cgroup path already // existed when Install() was called, Uninstall is a noop. func (c *Cgroup) Uninstall() error { - if !c.Own { - // cgroup is managed by caller, don't touch it. - return nil - } log.Debugf("Deleting cgroup %q", c.Name) for key := range controllers { + if !c.Own[key] { + // cgroup is managed by caller, don't touch it. + continue + } path := c.makePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 9794517a7..931144cf9 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -29,7 +29,10 @@ func TestUninstallEnoent(t *testing.T) { c := Cgroup{ // set a non-existent name Name: "runsc-test-uninstall-656e6f656e740a", - Own: true, + } + c.Own = make(map[string]bool) + for key := range controllers { + c.Own[key] = true } if err := c.Uninstall(); err != nil { t.Errorf("Uninstall() failed: %v", err) diff --git a/runsc/cli/main.go b/runsc/cli/main.go index bca015db5..6c3bf4d21 100644 --- a/runsc/cli/main.go +++ b/runsc/cli/main.go @@ -22,6 +22,7 @@ import ( "io/ioutil" "os" "os/signal" + "runtime" "syscall" "time" @@ -82,6 +83,7 @@ func Main(version string) { subcommands.Register(new(cmd.Spec), "") subcommands.Register(new(cmd.State), "") subcommands.Register(new(cmd.Start), "") + subcommands.Register(new(cmd.Symbolize), "") subcommands.Register(new(cmd.Wait), "") // Register internal commands with the internal group name. This causes @@ -207,6 +209,8 @@ func Main(version string) { log.Infof("***************************") log.Infof("Args: %s", os.Args) log.Infof("Version %s", version) + log.Infof("GOOS: %s", runtime.GOOS) + log.Infof("GOARCH: %s", runtime.GOARCH) log.Infof("PID: %d", os.Getpid()) log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) log.Infof("Configuration:") diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 2556f6d9e..19520d7ab 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -32,6 +32,7 @@ go_library( "start.go", "state.go", "statefile.go", + "symbolize.go", "syscalls.go", "wait.go", ], @@ -39,6 +40,7 @@ go_library( "//runsc:__subpackages__", ], deps = [ + "//pkg/coverage", "//pkg/log", "//pkg/p9", "//pkg/sentry/control", diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 640de4c47..8a8d9f752 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -81,7 +81,7 @@ func (c *Do) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { if len(f.Args()) == 0 { - c.Usage() + f.Usage() return subcommands.ExitUsageError } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 86c02a22a..eafd6285c 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -150,7 +150,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus { - // Start the new process and get it pid. + // Start the new process and get its pid. pid, err := c.Execute(e) if err != nil { return Errorf("executing processes for container: %v", err) diff --git a/runsc/cmd/symbolize.go b/runsc/cmd/symbolize.go new file mode 100644 index 000000000..fc0c69358 --- /dev/null +++ b/runsc/cmd/symbolize.go @@ -0,0 +1,91 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "bufio" + "context" + "os" + "strconv" + "strings" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/runsc/flag" +) + +// Symbolize implements subcommands.Command for the "symbolize" command. +type Symbolize struct { + dumpAll bool +} + +// Name implements subcommands.Command.Name. +func (*Symbolize) Name() string { + return "symbolize" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Symbolize) Synopsis() string { + return "Convert synthetic instruction pointers from kcov into positions in the runsc source code. Only used when Go coverage is enabled." +} + +// Usage implements subcommands.Command.Usage. +func (*Symbolize) Usage() string { + return `symbolize - converts synthetic instruction pointers into positions in the runsc source code. + +This command takes instruction pointers from stdin and converts them into their +corresponding file names and line/column numbers in the runsc source code. The +inputs are not interpreted as actual addresses, but as synthetic values that are +exposed through /sys/kernel/debug/kcov. One can extract coverage information +from kcov and translate those values into locations in the source code by +running symbolize on the same runsc binary. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Symbolize) SetFlags(f *flag.FlagSet) { + f.BoolVar(&c.dumpAll, "all", false, "dump information on all coverage blocks along with their synthetic PCs") +} + +// Execute implements subcommands.Command.Execute. +func (c *Symbolize) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + if !coverage.KcovAvailable() { + return Errorf("symbolize can only be used when coverage is available.") + } + coverage.InitCoverageData() + + if c.dumpAll { + coverage.WriteAllBlocks(os.Stdout) + return subcommands.ExitSuccess + } + + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + // Input is always base 16, but may or may not have a leading "0x". + str := strings.TrimPrefix(scanner.Text(), "0x") + pc, err := strconv.ParseUint(str, 16 /* base */, 64 /* bitSize */) + if err != nil { + return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) + } + if err := coverage.Symbolize(os.Stdout, pc); err != nil { + return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) + } + } + return subcommands.ExitSuccess +} diff --git a/runsc/console/console.go b/runsc/console/console.go index dbb88e117..b36028792 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -24,8 +24,8 @@ import ( "golang.org/x/sys/unix" ) -// NewWithSocket creates pty master/replica pair, sends the master FD over the given -// socket, and returns the replica. +// NewWithSocket creates pty master/replica pair, sends the master FD over the +// given socket, and returns the replica. func NewWithSocket(socketPath string) (*os.File, error) { // Create a new pty master and replica. ptyMaster, ptyReplica, err := pty.Open() diff --git a/runsc/container/BUILD b/runsc/container/BUILD index c33755482..8793c8916 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "more_shards") package(licenses = ["notice"]) @@ -24,6 +24,7 @@ go_library( "//runsc/boot", "//runsc/cgroup", "//runsc/config", + "//runsc/console", "//runsc/sandbox", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", @@ -48,7 +49,7 @@ go_test( "//test/cmd/test_app", ], library = ":container", - shard_count = 10, + shard_count = more_shards, tags = [ "requires-kvm", ], diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 4228399b8..1b0fdebd6 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "io" + "math/rand" "os" "path/filepath" "syscall" @@ -27,7 +28,6 @@ import ( "github.com/kr/pty" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/pkg/unet" @@ -38,19 +38,22 @@ import ( // path is under 108 charactors (the unix socket path length limit), // relativizing the path if necessary. func socketPath(bundleDir string) (string, error) { - path := filepath.Join(bundleDir, "socket") + num := rand.Intn(10000) + path := filepath.Join(bundleDir, fmt.Sprintf("socket-%4d", num)) + const maxPathLen = 108 + if len(path) <= maxPathLen { + return path, nil + } + + // Path is too large, try to make it smaller. cwd, err := os.Getwd() if err != nil { return "", fmt.Errorf("error getting cwd: %v", err) } - relPath, err := filepath.Rel(cwd, path) + path, err = filepath.Rel(cwd, path) if err != nil { return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err) } - if len(path) > len(relPath) { - path = relPath - } - const maxPathLen = 108 if len(path) > maxPathLen { return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path) } @@ -159,6 +162,82 @@ func TestConsoleSocket(t *testing.T) { } } +// Test that an pty FD is sent over the console socket if one is provided. +func TestMultiContainerConsoleSocket(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + tru := []string{"true"} + testSpecs, ids := createSpecs(sleep, tru) + testSpecs[1].Process.Terminal = true + + bundleDir, cleanup, err := testutil.SetupBundleDir(testSpecs[0]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: ids[0], + Spec: testSpecs[0], + BundleDir: bundleDir, + } + rootCont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer rootCont.Destroy() + if err := rootCont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + bundleDir, cleanup, err = testutil.SetupBundleDir(testSpecs[0]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args = Args{ + ID: ids[1], + Spec: testSpecs[1], + BundleDir: bundleDir, + ConsoleSocket: sock, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + ptyMaster.Close() + }) + } +} + // Test that job control signals work on a console created with "exec -ti". func TestJobControlSignalExec(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") @@ -221,9 +300,9 @@ func TestJobControlSignalExec(t *testing.T) { // Make sure all the processes are running. expectedPL := []*control.Process{ // Root container process. - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().Cmd("sleep").Process(), // Bash from exec process. - {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().PID(2).Cmd("bash").Process(), } if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) @@ -233,7 +312,7 @@ func TestJobControlSignalExec(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for it to start. Sleep's PPID is bash's PID. - expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}) + expectedPL = append(expectedPL, newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process()) if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) } @@ -254,7 +333,7 @@ func TestJobControlSignalExec(t *testing.T) { // Sleep is dead, but it may take more time for bash to notice and // change the foreground process back to itself. We know it is done // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", 5*time.Second); err != nil { t.Fatalf("bash did not take over pty: %v", err) } @@ -359,7 +438,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Wait for bash to start. expectedPL := []*control.Process{ - {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("bash").Process(), } if err := waitForProcessList(c, expectedPL); err != nil { t.Fatalf("error waiting for processes: %v", err) @@ -369,7 +448,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for sleep to start. - expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}}) + expectedPL = append(expectedPL, newProcessBuilder().PID(2).PPID(1).Cmd("sleep").Process()) if err := waitForProcessList(c, expectedPL); err != nil { t.Fatalf("error waiting for processes: %v", err) } @@ -393,7 +472,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Sleep is dead, but it may take more time for bash to notice and // change the foreground process back to itself. We know it is done // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil { + if err := testutil.WaitUntilRead(ptyBuf, "Terminated", 5*time.Second); err != nil { t.Fatalf("bash did not take over pty: %v", err) } @@ -414,6 +493,104 @@ func TestJobControlSignalRootContainer(t *testing.T) { } } +// Test that terminal works with root and sub-containers. +func TestMultiContainerTerminal(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Don't let bash execute from profile or rc files, otherwise our PID + // counts get messed up. + bash := []string{"/bin/bash", "--noprofile", "--norc"} + testSpecs, ids := createSpecs(bash, bash) + + type termContainer struct { + container *Container + master *os.File + } + var containers []termContainer + for i, spec := range testSpecs { + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + spec.Process.Terminal = true + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args := Args{ + ID: ids[i], + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: sock, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + defer ptyMaster.Close() + + containers = append(containers, termContainer{ + container: cont, + master: ptyMaster, + }) + } + + for _, tc := range containers { + // Bash output as well as sandbox output will be written to the PTY + // file. Writes after a certain point will block unless we drain the + // PTY, so we must continually copy from it. + // + // We log the output to stderr for debugabilitly, and also to a buffer, + // since we wait on particular output from bash below. We use a custom + // blockingBuffer which is thread-safe and also blocks on Read calls, + // which makes this a suitable Reader for WaitUntilRead. + ptyBuf := newBlockingBuffer() + tee := io.TeeReader(tc.master, ptyBuf) + go io.Copy(os.Stderr, tee) + + // Wait for bash to start. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("bash").Process(), + } + if err := waitForProcessList(tc.container, expectedPL); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + // Execute echo command and check that it was executed correctly. Use + // a variable to ensure it's not matching against command echo. + tc.master.Write([]byte("echo foo-${PWD}-123\n")) + if err := testutil.WaitUntilRead(ptyBuf, "foo-/-123", 5*time.Second); err != nil { + t.Fatalf("echo didn't execute: %v", err) + } + } + }) + } +} + // blockingBuffer is a thread-safe buffer that blocks when reading if the // buffer is empty. It implements io.ReadWriter. type blockingBuffer struct { diff --git a/runsc/container/container.go b/runsc/container/container.go index 4aa139c88..418a27beb 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -38,6 +38,7 @@ import ( "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/sandbox" "gvisor.dev/gvisor/runsc/specutils" ) @@ -79,6 +80,7 @@ func validateID(id string) error { // - It calls 'runsc delete'. runc implementation kills --all SIGKILL once // again just to be sure, waits, and then proceeds with remaining teardown. // +// Container is thread-unsafe. type Container struct { // ID is the container ID. ID string `json:"id"` @@ -397,7 +399,22 @@ func New(conf *config.Config, args Args) (*Container, error) { return nil, err } c.Sandbox = sb.Sandbox - if err := c.Sandbox.CreateContainer(c.ID); err != nil { + + // If the console control socket file is provided, then create a new + // pty master/slave pair and send the TTY to the sandbox process. + var tty *os.File + if c.ConsoleSocket != "" { + // Create a new TTY pair and send the master on the provided socket. + var err error + tty, err = console.NewWithSocket(c.ConsoleSocket) + if err != nil { + return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err) + } + // tty file is transferred to the sandbox, then it can be closed here. + defer tty.Close() + } + + if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil { return nil, err } } @@ -451,11 +468,16 @@ func (c *Container) Start(conf *config.Config) error { // the start (and all their children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) + goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) if err != nil { return err } - defer mountsFile.Close() + defer func() { + _ = mountsFile.Close() + for _, f := range goferFiles { + _ = f.Close() + } + }() cleanMounts, err := specutils.ReadMounts(mountsFile) if err != nil { @@ -463,7 +485,14 @@ func (c *Container) Start(conf *config.Config) error { } c.Spec.Mounts = cleanMounts - return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) + // Setup stdios if the container is not using terminal. Otherwise TTY was + // already setup in create. + var stdios []*os.File + if !c.Spec.Process.Terminal { + stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} + } + + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles) }); err != nil { return err } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index cadc63bf3..45d4e6e6e 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -301,54 +301,21 @@ func TestMultiContainerWait(t *testing.T) { } defer cleanup() - // Check via ps that multiple processes are running. - expectedPL := []*control.Process{ - newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), - } - if err := waitForProcessList(containers[1], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - - // Wait on the short lived container from multiple goroutines. - wg := sync.WaitGroup{} - for i := 0; i < 3; i++ { - wg.Add(1) - go func(c *Container) { - defer wg.Done() - if ws, err := c.Wait(); err != nil { - t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) - } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) - } - if _, err := c.Wait(); err != nil { - t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) - } - }(containers[1]) + // Check that we can wait for the sub-container. + c := containers[1] + if ws, err := c.Wait(); err != nil { + t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) } - - // Also wait via PID. - for i := 0; i < 3; i++ { - wg.Add(1) - go func(c *Container) { - defer wg.Done() - const pid = 2 - if ws, err := c.WaitPID(pid); err != nil { - t.Errorf("failed to wait for PID %d: %v", pid, err) - } else if es := ws.ExitStatus(); es != 0 { - t.Errorf("PID %d exited with non-zero status %d", pid, es) - } - if _, err := c.WaitPID(pid); err == nil { - t.Errorf("wait for stopped PID %d should fail", pid) - } - }(containers[1]) + if _, err := c.Wait(); err != nil { + t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) } - wg.Wait() - // After Wait returns, ensure that the root container is running and // the child has finished. - expectedPL = []*control.Process{ - newProcessBuilder().Cmd("sleep").Process(), + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sleep").PID(1).Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 96c57a426..c56e1d4d0 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -29,9 +29,12 @@ go_test( srcs = ["fsgofer_test.go"], library = ":fsgofer", deps = [ + "//pkg/fd", "//pkg/log", "//pkg/p9", "//pkg/test/testutil", + "//runsc/specutils", + "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 0b628c8ce..3d94ffeb4 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -49,6 +49,21 @@ const ( allowedOpenFlags = unix.O_TRUNC ) +var ( + // Remember the process uid/gid to skip chown calls when file owner/group + // doesn't need to be changed. + processUID = p9.UID(os.Getuid()) + processGID = p9.GID(os.Getgid()) +) + +// join is equivalent to path.Join() but skips path.Clean() which is expensive. +func join(parent, child string) string { + if child == "." || child == ".." { + panic(fmt.Sprintf("invalid child path %q", child)) + } + return parent + "/" + child +} + // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. @@ -115,7 +130,7 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } - lf, err := newLocalFile(a, f, a.prefix, readable, stat) + lf, err := newLocalFile(a, f, a.prefix, readable, &stat) if err != nil { return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) } @@ -124,7 +139,7 @@ func (a *attachPoint) Attach() (p9.File, error) { } // makeQID returns a unique QID for the given stat buffer. -func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID { +func (a *attachPoint) makeQID(stat *unix.Stat_t) p9.QID { a.deviceMu.Lock() defer a.deviceMu.Unlock() @@ -245,7 +260,7 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { } func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) { - pathDebug := path.Join(parent.hostPath, name) + pathDebug := join(parent.hostPath, name) f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) { return fd.OpenAt(parent.file, name, openFlags|mode, 0) }) @@ -297,8 +312,8 @@ func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, b return nil, false, extractErrno(err) } -func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error { - switch stat.Mode & unix.S_IFMT { +func checkSupportedFileType(mode uint32, permitSocket bool) error { + switch mode & unix.S_IFMT { case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK: return nil @@ -313,8 +328,8 @@ func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error { } } -func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) { - if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil { +func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat *unix.Stat_t) (*localFile, error) { + if err := checkSupportedFileType(stat.Mode, a.conf.HostUDS); err != nil { return nil, err } @@ -442,8 +457,10 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, }) defer cu.Clean() - if err := fchown(child.FD(), uid, gid); err != nil { - return nil, nil, p9.QID{}, 0, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(child.FD(), uid, gid); err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } } stat, err := fstat(child.FD()) if err != nil { @@ -452,11 +469,11 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, c := &localFile{ attachPoint: l.attachPoint, - hostPath: path.Join(l.hostPath, name), + hostPath: join(l.hostPath, name), file: child, mode: mode, fileType: unix.S_IFREG, - qid: l.attachPoint.makeQID(stat), + qid: l.attachPoint.makeQID(&stat), } cu.Release() @@ -488,8 +505,10 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) } defer f.Close() - if err := fchown(f.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } } stat, err := fstat(f.FD()) if err != nil { @@ -497,7 +516,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) } cu.Release() - return l.attachPoint.makeQID(stat), nil + return l.attachPoint.makeQID(&stat), nil } // Walk implements p9.File. @@ -512,7 +531,7 @@ func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, if err != nil { return nil, nil, p9.AttrMask{}, p9.Attr{}, err } - mask, attr := l.fillAttr(stat) + mask, attr := l.fillAttr(&stat) return qids, file, mask, attr, nil } @@ -538,13 +557,13 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) file: newFile, mode: invalidMode, fileType: l.fileType, - qid: l.attachPoint.makeQID(stat), + qid: l.attachPoint.makeQID(&stat), controlReadable: readable, } return []p9.QID{c.qid}, c, stat, nil } - var qids []p9.QID + qids := make([]p9.QID, 0, len(names)) var lastStat unix.Stat_t last := l for _, name := range names { @@ -560,7 +579,7 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) _ = f.Close() return nil, nil, unix.Stat_t{}, extractErrno(err) } - c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat) + c, err := newLocalFile(last.attachPoint, f, path, readable, &lastStat) if err != nil { _ = f.Close() return nil, nil, unix.Stat_t{}, extractErrno(err) @@ -609,11 +628,11 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } - mask, attr := l.fillAttr(stat) + mask, attr := l.fillAttr(&stat) return l.qid, mask, attr, nil } -func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) { +func (l *localFile) fillAttr(stat *unix.Stat_t) (p9.AttrMask, p9.Attr) { attr := p9.Attr{ Mode: p9.FileMode(stat.Mode), UID: p9.UID(stat.Uid), @@ -881,8 +900,10 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. } defer f.Close() - if err := fchown(f.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } } stat, err := fstat(f.FD()) if err != nil { @@ -890,7 +911,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. } cu.Release() - return l.attachPoint.makeQID(stat), nil + return l.attachPoint.makeQID(&stat), nil } // Link implements p9.File. @@ -938,8 +959,10 @@ func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid } defer child.Close() - if err := fchown(child.FD(), uid, gid); err != nil { - return p9.QID{}, extractErrno(err) + if uid != processUID || gid != processGID { + if err := fchown(child.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } } stat, err := fstat(child.FD()) if err != nil { @@ -947,7 +970,7 @@ func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid } cu.Release() - return l.attachPoint.makeQID(stat), nil + return l.attachPoint.makeQID(&stat), nil } // UnlinkAt implements p9.File. @@ -1045,7 +1068,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) log.Warningf("Readdir is skipping file with failed stat %q, err: %v", l.hostPath, err) continue } - qid := l.attachPoint.makeQID(stat) + qid := l.attachPoint.makeQID(&stat) offset++ dirents = append(dirents, p9.Dirent{ QID: qid, @@ -1139,7 +1162,7 @@ func (l *localFile) isOpen() bool { // Renamed implements p9.Renamed. func (l *localFile) Renamed(newDir p9.File, newName string) { - l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) + l.hostPath = join(newDir.(*localFile).hostPath, newName) } // extractErrno tries to determine the errno. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index a84206686..c5daebe5e 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -23,10 +23,13 @@ import ( "path/filepath" "testing" + "github.com/syndtr/gocapability/capability" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/runsc/specutils" ) var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} @@ -197,10 +200,13 @@ func setup(fileType uint32) (string, string, error) { switch fileType { case unix.S_IFREG: name = "file" - _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + fd, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) } + if fd != nil { + fd.Close() + } defer f.Close() case unix.S_IFDIR: name = "dir" @@ -556,7 +562,28 @@ func TestROMountChecks(t *testing.T) { func TestWalkNotFound(t *testing.T) { runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT { - t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err) + t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody-here", err) + } + if _, _, err := s.file.Walk([]string{"nobody", "here"}); err != unix.ENOENT { + t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody/here", err) + } + if !s.conf.ROMount { + if _, err := s.file.Mkdir("dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("MkDir(dir) failed, err: %v", err) + } + if _, _, err := s.file.Walk([]string{"dir", "nobody-here"}); err != unix.ENOENT { + t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "dir/nobody-here", err) + } + } + }) +} + +func TestWalkPanic(t *testing.T) { + runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { + for _, name := range []string{".", ".."} { + assertPanic(t, func() { + s.file.Walk([]string{name}) + }) } }) } @@ -574,6 +601,27 @@ func TestWalkDup(t *testing.T) { }) } +func TestWalkMultiple(t *testing.T) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + var names []string + var parent p9.File = s.file + for i := 0; i < 5; i++ { + name := fmt.Sprintf("dir%d", i) + names = append(names, name) + + if _, err := parent.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("MkDir(%q) failed, err: %v", name, err) + } + + var err error + _, parent, err = s.file.Walk(names) + if err != nil { + t.Errorf("Walk(%q): %v", name, err) + } + } + }) +} + func TestReaddir(t *testing.T) { runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { name := "dir" @@ -819,3 +867,168 @@ func TestMknod(t *testing.T) { } }) } + +func BenchmarkWalkOne(b *testing.B) { + path, name, err := setup(unix.S_IFDIR) + if err != nil { + b.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a, err := NewAttachPoint(path, Config{}) + if err != nil { + b.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + b.Fatalf("Attach failed, err: %v", err) + } + defer root.Close() + + names := []string{name} + files := make([]p9.File, 0, 1000) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, file, err := root.Walk(names) + if err != nil { + b.Fatalf("Walk(%q): %v", name, err) + } + files = append(files, file) + + // Avoid running out of FDs. + if len(files) == cap(files) { + b.StopTimer() + for _, file := range files { + file.Close() + } + files = files[:0] + b.StartTimer() + } + } + + b.StopTimer() + for _, file := range files { + file.Close() + } +} + +func BenchmarkCreate(b *testing.B) { + path, _, err := setup(unix.S_IFDIR) + if err != nil { + b.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a, err := NewAttachPoint(path, Config{}) + if err != nil { + b.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + b.Fatalf("Attach failed, err: %v", err) + } + defer root.Close() + + files := make([]p9.File, 0, 500) + fds := make([]*fd.FD, 0, 500) + uid := p9.UID(os.Getuid()) + gid := p9.GID(os.Getgid()) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + name := fmt.Sprintf("same-%d", i) + fd, file, _, _, err := root.Create(name, p9.ReadOnly, 0777, uid, gid) + if err != nil { + b.Fatalf("Create(%q): %v", name, err) + } + files = append(files, file) + if fd != nil { + fds = append(fds, fd) + } + + // Avoid running out of FDs. + if len(files) == cap(files) { + b.StopTimer() + for _, file := range files { + file.Close() + } + files = files[:0] + for _, fd := range fds { + fd.Close() + } + fds = fds[:0] + b.StartTimer() + } + } + + b.StopTimer() + for _, file := range files { + file.Close() + } + for _, fd := range fds { + fd.Close() + } +} + +func BenchmarkCreateDiffOwner(b *testing.B) { + if !specutils.HasCapabilities(capability.CAP_CHOWN) { + b.Skipf("Test requires CAP_CHOWN") + } + + path, _, err := setup(unix.S_IFDIR) + if err != nil { + b.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a, err := NewAttachPoint(path, Config{}) + if err != nil { + b.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + b.Fatalf("Attach failed, err: %v", err) + } + defer root.Close() + + files := make([]p9.File, 0, 500) + fds := make([]*fd.FD, 0, 500) + gid := p9.GID(os.Getgid()) + const nobody = 65534 + + b.ResetTimer() + for i := 0; i < b.N; i++ { + name := fmt.Sprintf("diff-%d", i) + fd, file, _, _, err := root.Create(name, p9.ReadOnly, 0777, nobody, gid) + if err != nil { + b.Fatalf("Create(%q): %v", name, err) + } + files = append(files, file) + if fd != nil { + fds = append(fds, fd) + } + + // Avoid running out of FDs. + if len(files) == cap(files) { + b.StopTimer() + for _, file := range files { + file.Close() + } + files = files[:0] + for _, fd := range fds { + fd.Close() + } + fds = fds[:0] + b.StartTimer() + } + } + + b.StopTimer() + for _, file := range files { + file.Close() + } + for _, fd := range fds { + fd.Close() + } +} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 8f66dd1f8..9e429f7d5 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -127,7 +127,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG // Get all interfaces in the namespace. ifaces, err := net.Interfaces() if err != nil { - return fmt.Errorf("querying interfaces: %v", err) + return fmt.Errorf("querying interfaces: %w", err) } isRoot, err := isRootNS() @@ -148,14 +148,14 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG allAddrs, err := iface.Addrs() if err != nil { - return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err) + return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) } // We build our own loopback device. if iface.Flags&net.FlagLoopback != 0 { link, err := loopbackLink(iface, allAddrs) if err != nil { - return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err) + return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) } args.LoopbackLinks = append(args.LoopbackLinks, link) continue @@ -209,7 +209,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG // Get the link for the interface. ifaceLink, err := netlink.LinkByName(iface.Name) if err != nil { - return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) + return fmt.Errorf("getting link for interface %q: %w", iface.Name, err) } link.LinkAddress = ifaceLink.Attrs().HardwareAddr @@ -219,7 +219,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG log.Debugf("Creating Channel %d", i) socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO) if err != nil { - return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) + return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err) } if i == 0 { link.GSOMaxSize = socketEntry.gsoMaxSize @@ -241,11 +241,12 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG // Collect the addresses for the interface, enable forwarding, // and remove them from the host. for _, addr := range ipAddrs { - link.Addresses = append(link.Addresses, addr.IP) + prefix, _ := addr.Mask.Size() + link.Addresses = append(link.Addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix}) // Steal IP address from NIC. if err := removeAddress(ifaceLink, addr.String()); err != nil { - return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err) + return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err) } } @@ -254,7 +255,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG log.Debugf("Setting up network, config: %+v", args) if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { - return fmt.Errorf("creating links and routes: %v", err) + return fmt.Errorf("creating links and routes: %w", err) } return nil } @@ -278,8 +279,6 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) ( ll := syscall.SockaddrLinklayer{ Protocol: protocol, Ifindex: iface.Index, - Hatype: 0, // No ARP type. - Pkttype: syscall.PACKET_OTHERHOST, } if err := syscall.Bind(fd, &ll); err != nil { return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) @@ -339,9 +338,15 @@ func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, err if !ok { return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr) } + + prefix, _ := ipNet.Mask.Size() + link.Addresses = append(link.Addresses, boot.IPWithPrefix{ + Address: ipNet.IP, + PrefixLen: prefix, + }) + dst := *ipNet dst.IP = dst.IP.Mask(dst.Mask) - link.Addresses = append(link.Addresses, ipNet.IP) link.Routes = append(link.Routes, boot.Route{ Destination: dst, }) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 4a4110477..c84ebcd8a 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -173,7 +173,7 @@ func New(conf *config.Config, args *Args) (*Sandbox, error) { } // CreateContainer creates a non-root container inside the sandbox. -func (s *Sandbox) CreateContainer(cid string) error { +func (s *Sandbox) CreateContainer(cid string, tty *os.File) error { log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { @@ -181,7 +181,16 @@ func (s *Sandbox) CreateContainer(cid string) error { } defer sandboxConn.Close() - if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil { + var files []*os.File + if tty != nil { + files = []*os.File{tty} + } + + args := boot.CreateArgs{ + CID: cid, + FilePayload: urpc.FilePayload{Files: files}, + } + if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil { return fmt.Errorf("creating non-root container %q: %v", cid, err) } return nil @@ -211,11 +220,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error { } // StartContainer starts running a non-root container inside the sandbox. -func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error { - for _, f := range goferFiles { - defer f.Close() - } - +func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error { log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { @@ -223,15 +228,18 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid stri } defer sandboxConn.Close() - // The payload must container stdin/stdout/stderr followed by gofer - // files. - files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...) + // The payload must contain stdin/stdout/stderr (which may be empty if using + // TTY) followed by gofer files. + payload := urpc.FilePayload{} + payload.Files = append(payload.Files, stdios...) + payload.Files = append(payload.Files, goferFiles...) + // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: files}, + FilePayload: payload, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err) diff --git a/test/benchmarks/BUILD b/test/benchmarks/BUILD new file mode 100644 index 000000000..faf310676 --- /dev/null +++ b/test/benchmarks/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "bzl_library") + +package(licenses = ["notice"]) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = [ + "//:sandbox", + ], +) diff --git a/test/benchmarks/base/BUILD b/test/benchmarks/base/BUILD index b4b55317b..697ab5837 100644 --- a/test/benchmarks/base/BUILD +++ b/test/benchmarks/base/BUILD @@ -1,4 +1,5 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library") +load("//test/benchmarks:defs.bzl", "benchmark_test") package(licenses = ["notice"]) @@ -14,7 +15,7 @@ go_library( ], ) -go_test( +benchmark_test( name = "startup_test", size = "enormous", srcs = ["startup_test.go"], @@ -26,7 +27,7 @@ go_test( ], ) -go_test( +benchmark_test( name = "size_test", size = "enormous", srcs = ["size_test.go"], @@ -39,7 +40,7 @@ go_test( ], ) -go_test( +benchmark_test( name = "sysbench_test", size = "enormous", srcs = ["sysbench_test.go"], diff --git a/test/benchmarks/database/BUILD b/test/benchmarks/database/BUILD index 93b380e8a..0b1743603 100644 --- a/test/benchmarks/database/BUILD +++ b/test/benchmarks/database/BUILD @@ -1,4 +1,5 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library") +load("//test/benchmarks:defs.bzl", "benchmark_test") package(licenses = ["notice"]) @@ -6,19 +7,13 @@ go_library( name = "database", testonly = 1, srcs = ["database.go"], - deps = ["//test/benchmarks/harness"], ) -go_test( - name = "database_test", +benchmark_test( + name = "redis_test", size = "enormous", srcs = ["redis_test.go"], library = ":database", - tags = [ - # Requires docker and runsc to be configured before test runs. - "manual", - "local", - ], visibility = ["//:sandbox"], deps = [ "//pkg/test/dockerutil", diff --git a/test/benchmarks/database/database.go b/test/benchmarks/database/database.go index 9eeb59f9a..c15ca661c 100644 --- a/test/benchmarks/database/database.go +++ b/test/benchmarks/database/database.go @@ -14,18 +14,3 @@ // Package database holds benchmarks around database applications. package database - -import ( - "os" - "testing" - - "gvisor.dev/gvisor/test/benchmarks/harness" -) - -var h harness.Harness - -// TestMain is the main method for package database. -func TestMain(m *testing.M) { - h.Init() - os.Exit(m.Run()) -} diff --git a/test/benchmarks/database/redis_test.go b/test/benchmarks/database/redis_test.go index 02e67154e..f8075a04b 100644 --- a/test/benchmarks/database/redis_test.go +++ b/test/benchmarks/database/redis_test.go @@ -16,6 +16,7 @@ package database import ( "context" + "os" "testing" "time" @@ -24,6 +25,8 @@ import ( "gvisor.dev/gvisor/test/benchmarks/tools" ) +var h harness.Harness + // All possible operations from redis. Note: "ping" will // run both PING_INLINE and PING_BUILD. var operations []string = []string{ @@ -111,21 +114,23 @@ func BenchmarkRedis(b *testing.B) { // Reset profiles and timer to begin the measurement. server.RestartProfiles() b.ResetTimer() - for i := 0; i < b.N; i++ { - client := clientMachine.GetNativeContainer(ctx, b) - defer client.CleanUp(ctx) - out, err := client.Run(ctx, dockerutil.RunOpts{ - Image: "benchmarks/redis", - }, redis.MakeCmd(ip, serverPort)...) - if err != nil { - b.Fatalf("redis-benchmark failed with: %v", err) - } - - // Stop time while we parse results. - b.StopTimer() - redis.Report(b, out) - b.StartTimer() + client := clientMachine.GetNativeContainer(ctx, b) + defer client.CleanUp(ctx) + out, err := client.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/redis", + }, redis.MakeCmd(ip, serverPort, b.N /*requests*/)...) + if err != nil { + b.Fatalf("redis-benchmark failed with: %v", err) } + + // Stop time while we parse results. + b.StopTimer() + redis.Report(b, out) }) } } + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/defs.bzl b/test/benchmarks/defs.bzl new file mode 100644 index 000000000..ef44b46e3 --- /dev/null +++ b/test/benchmarks/defs.bzl @@ -0,0 +1,14 @@ +"""Defines a rule for benchmark test targets.""" + +load("//tools:defs.bzl", "go_test") + +def benchmark_test(name, tags = [], **kwargs): + go_test( + name, + tags = [ + # Requires docker and runsc to be configured before the test runs. + "local", + "manual", + ], + **kwargs + ) diff --git a/test/benchmarks/fs/BUILD b/test/benchmarks/fs/BUILD index 021fae38d..b4f967441 100644 --- a/test/benchmarks/fs/BUILD +++ b/test/benchmarks/fs/BUILD @@ -1,8 +1,8 @@ -load("//tools:defs.bzl", "go_test") +load("//test/benchmarks:defs.bzl", "benchmark_test") package(licenses = ["notice"]) -go_test( +benchmark_test( name = "bazel_test", size = "enormous", srcs = ["bazel_test.go"], @@ -14,7 +14,7 @@ go_test( ], ) -go_test( +benchmark_test( name = "fio_test", size = "enormous", srcs = ["fio_test.go"], diff --git a/test/benchmarks/fs/bazel_test.go b/test/benchmarks/fs/bazel_test.go index 53ed3f9f2..3fb4da9d1 100644 --- a/test/benchmarks/fs/bazel_test.go +++ b/test/benchmarks/fs/bazel_test.go @@ -61,10 +61,10 @@ func runBuildBenchmark(b *testing.B, image, workdir, target string) { for _, bm := range benchmarks { pageCache := tools.Parameter{ Name: "page_cache", - Value: "clean", + Value: "dirty", } if bm.clearCache { - pageCache.Value = "dirty" + pageCache.Value = "clean" } filesystem := tools.Parameter{ @@ -129,12 +129,14 @@ func runBuildBenchmark(b *testing.B, image, workdir, target string) { if !strings.Contains(got, want) { b.Fatalf("string %s not in: %s", want, got) } - // Clean bazel in case we use b.N. - _, err = container.Exec(ctx, dockerutil.ExecOpts{ - WorkDir: prefix + workdir, - }, "bazel", "clean") - if err != nil { - b.Fatalf("build failed with: %v", err) + + // Clean bazel in the case we are doing another run. + if i < b.N-1 { + if _, err = container.Exec(ctx, dockerutil.ExecOpts{ + WorkDir: prefix + workdir, + }, "bazel", "clean"); err != nil { + b.Fatalf("build failed with: %v", err) + } } b.StartTimer() } diff --git a/test/benchmarks/harness/harness.go b/test/benchmarks/harness/harness.go index 5c9d0e01e..4c6e724aa 100644 --- a/test/benchmarks/harness/harness.go +++ b/test/benchmarks/harness/harness.go @@ -39,7 +39,7 @@ func (h *Harness) Init() error { flag.PrintDefaults() } flag.Parse() - if flag.NFlag() == 0 || *help { + if *help { flag.Usage() os.Exit(0) } diff --git a/test/benchmarks/media/BUILD b/test/benchmarks/media/BUILD index bb242d385..380783f0b 100644 --- a/test/benchmarks/media/BUILD +++ b/test/benchmarks/media/BUILD @@ -1,4 +1,5 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library") +load("//test/benchmarks:defs.bzl", "benchmark_test") package(licenses = ["notice"]) @@ -6,12 +7,11 @@ go_library( name = "media", testonly = 1, srcs = ["media.go"], - deps = ["//test/benchmarks/harness"], ) -go_test( - name = "media_test", - size = "large", +benchmark_test( + name = "ffmpeg_test", + size = "enormous", srcs = ["ffmpeg_test.go"], library = ":media", visibility = ["//:sandbox"], diff --git a/test/benchmarks/media/ffmpeg_test.go b/test/benchmarks/media/ffmpeg_test.go index 7822dfad7..a462ec2a6 100644 --- a/test/benchmarks/media/ffmpeg_test.go +++ b/test/benchmarks/media/ffmpeg_test.go @@ -15,6 +15,7 @@ package media import ( "context" + "os" "strings" "testing" @@ -22,6 +23,8 @@ import ( "gvisor.dev/gvisor/test/benchmarks/harness" ) +var h harness.Harness + // BenchmarkFfmpeg runs ffmpeg in a container and records runtime. // BenchmarkFfmpeg should run as root to drop caches. func BenchmarkFfmpeg(b *testing.B) { @@ -32,13 +35,13 @@ func BenchmarkFfmpeg(b *testing.B) { defer machine.CleanUp() ctx := context.Background() - container := machine.GetContainer(ctx, b) - defer container.CleanUp(ctx) cmd := strings.Split("ffmpeg -i video.mp4 -c:v libx264 -preset veryslow output.mp4", " ") b.ResetTimer() for i := 0; i < b.N; i++ { b.StopTimer() + container := machine.GetContainer(ctx, b) + defer container.CleanUp(ctx) if err := harness.DropCaches(machine); err != nil { b.Skipf("failed to drop caches: %v. You probably need root.", err) } @@ -51,3 +54,8 @@ func BenchmarkFfmpeg(b *testing.B) { } } } + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/media/media.go b/test/benchmarks/media/media.go index c7b35b758..ed7b24651 100644 --- a/test/benchmarks/media/media.go +++ b/test/benchmarks/media/media.go @@ -14,18 +14,3 @@ // Package media holds benchmarks around media processing applications. package media - -import ( - "os" - "testing" - - "gvisor.dev/gvisor/test/benchmarks/harness" -) - -var h harness.Harness - -// TestMain is the main method for package media. -func TestMain(m *testing.M) { - h.Init() - os.Exit(m.Run()) -} diff --git a/test/benchmarks/ml/BUILD b/test/benchmarks/ml/BUILD index 970f52706..285ec35d9 100644 --- a/test/benchmarks/ml/BUILD +++ b/test/benchmarks/ml/BUILD @@ -1,4 +1,5 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library") +load("//test/benchmarks:defs.bzl", "benchmark_test") package(licenses = ["notice"]) @@ -6,12 +7,11 @@ go_library( name = "ml", testonly = 1, srcs = ["ml.go"], - deps = ["//test/benchmarks/harness"], ) -go_test( - name = "ml_test", - size = "large", +benchmark_test( + name = "tensorflow_test", + size = "enormous", srcs = ["tensorflow_test.go"], library = ":ml", visibility = ["//:sandbox"], diff --git a/test/benchmarks/ml/ml.go b/test/benchmarks/ml/ml.go index 13282d7bb..d5fc5b7da 100644 --- a/test/benchmarks/ml/ml.go +++ b/test/benchmarks/ml/ml.go @@ -14,18 +14,3 @@ // Package ml holds benchmarks around machine learning performance. package ml - -import ( - "os" - "testing" - - "gvisor.dev/gvisor/test/benchmarks/harness" -) - -var h harness.Harness - -// TestMain is the main method for package ml. -func TestMain(m *testing.M) { - h.Init() - os.Exit(m.Run()) -} diff --git a/test/benchmarks/ml/tensorflow_test.go b/test/benchmarks/ml/tensorflow_test.go index f7746897d..a55329d82 100644 --- a/test/benchmarks/ml/tensorflow_test.go +++ b/test/benchmarks/ml/tensorflow_test.go @@ -15,12 +15,15 @@ package ml import ( "context" + "os" "testing" "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/test/benchmarks/harness" ) +var h harness.Harness + // BenchmarkTensorflow runs workloads from a TensorFlow tutorial. // See: https://github.com/aymericdamien/TensorFlow-Examples func BenchmarkTensorflow(b *testing.B) { @@ -44,12 +47,12 @@ func BenchmarkTensorflow(b *testing.B) { for name, workload := range workloads { b.Run(name, func(b *testing.B) { ctx := context.Background() - container := machine.GetContainer(ctx, b) - defer container.CleanUp(ctx) b.ResetTimer() for i := 0; i < b.N; i++ { b.StopTimer() + container := machine.GetContainer(ctx, b) + defer container.CleanUp(ctx) if err := harness.DropCaches(machine); err != nil { b.Skipf("failed to drop caches: %v. You probably need root.", err) } @@ -67,3 +70,8 @@ func BenchmarkTensorflow(b *testing.B) { } } + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/network/BUILD b/test/benchmarks/network/BUILD index 472b5c387..2741570f5 100644 --- a/test/benchmarks/network/BUILD +++ b/test/benchmarks/network/BUILD @@ -1,4 +1,5 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library") +load("//test/benchmarks:defs.bzl", "benchmark_test") package(licenses = ["notice"]) @@ -7,7 +8,6 @@ go_library( testonly = 1, srcs = [ "network.go", - "static_server.go", ], deps = [ "//pkg/test/dockerutil", @@ -16,22 +16,74 @@ go_library( ], ) -go_test( - name = "network_test", - size = "large", +benchmark_test( + name = "iperf_test", + size = "enormous", srcs = [ - "httpd_test.go", "iperf_test.go", - "nginx_test.go", + ], + library = ":network", + visibility = ["//:sandbox"], + deps = [ + "//pkg/test/dockerutil", + "//pkg/test/testutil", + "//test/benchmarks/harness", + "//test/benchmarks/tools", + ], +) + +benchmark_test( + name = "node_test", + size = "enormous", + srcs = [ "node_test.go", + ], + library = ":network", + visibility = ["//:sandbox"], + deps = [ + "//pkg/test/dockerutil", + "//test/benchmarks/harness", + "//test/benchmarks/tools", + ], +) + +benchmark_test( + name = "ruby_test", + size = "enormous", + srcs = [ "ruby_test.go", ], library = ":network", - tags = [ - # Requires docker and runsc to be configured before test runs. - "manual", - "local", + visibility = ["//:sandbox"], + deps = [ + "//pkg/test/dockerutil", + "//test/benchmarks/harness", + "//test/benchmarks/tools", + ], +) + +benchmark_test( + name = "nginx_test", + size = "enormous", + srcs = [ + "nginx_test.go", + ], + library = ":network", + visibility = ["//:sandbox"], + deps = [ + "//pkg/test/dockerutil", + "//test/benchmarks/harness", + "//test/benchmarks/tools", ], +) + +benchmark_test( + name = "httpd_test", + size = "enormous", + srcs = [ + "httpd_test.go", + ], + library = ":network", visibility = ["//:sandbox"], deps = [ "//pkg/test/dockerutil", diff --git a/test/benchmarks/network/httpd_test.go b/test/benchmarks/network/httpd_test.go index 8d7d5f750..b07274662 100644 --- a/test/benchmarks/network/httpd_test.go +++ b/test/benchmarks/network/httpd_test.go @@ -14,13 +14,17 @@ package network import ( + "os" "strconv" "testing" "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" "gvisor.dev/gvisor/test/benchmarks/tools" ) +var h harness.Harness + // see Dockerfile '//images/benchmarks/httpd'. var httpdDocs = map[string]string{ "notfound": "notfound", @@ -43,6 +47,22 @@ func BenchmarkReverseHttpd(b *testing.B) { benchmarkHttpdDocSize(b, true /* reverse */) } +// BenchmarkContinuousHttpd runs specific benchmarks for continous jobs. +// The runtime under test is the server serving a runc client. +func BenchmarkContinuousHttpd(b *testing.B) { + sizes := []string{"10Kb", "100Kb", "1Mb"} + threads := []int{1, 25, 100, 1000} + benchmarkHttpdContinuous(b, threads, sizes, false /*reverse*/) +} + +// BenchmarkContinuousHttpdReverse runs specific benchmarks for continous jobs. +// The runtime under test is the client downloading from a runc server. +func BenchmarkContinuousHttpdReverse(b *testing.B) { + sizes := []string{"10Kb", "100Kb", "1Mb"} + threads := []int{1, 25, 100, 1000} + benchmarkHttpdContinuous(b, threads, sizes, true /*reverse*/) +} + // benchmarkHttpdDocSize iterates through all doc sizes, running subbenchmarks // for each size. func benchmarkHttpdDocSize(b *testing.B, reverse bool) { @@ -62,9 +82,51 @@ func benchmarkHttpdDocSize(b *testing.B, reverse bool) { if err != nil { b.Fatalf("Failed to parse parameters: %v", err) } + requests := b.N + if requests < c { + b.Logf("b.N is %d must be greater than threads %d. Consider running with --test.benchtime=Nx where N >= %d", b.N, c, c) + requests = c + } + b.Run(name, func(b *testing.B) { + hey := &tools.Hey{ + Requests: requests, + Concurrency: c, + Doc: filename, + } + runHttpd(b, hey, reverse) + }) + } + } +} + +// benchmarkHttpdContinuous iterates through given sizes and concurrencies. +func benchmarkHttpdContinuous(b *testing.B, concurrency []int, sizes []string, reverse bool) { + for _, size := range sizes { + filename := httpdDocs[size] + for _, c := range concurrency { + fsize := tools.Parameter{ + Name: "filesize", + Value: size, + } + + threads := tools.Parameter{ + Name: "concurrency", + Value: strconv.Itoa(c), + } + + name, err := tools.ParametersToName(fsize, threads) + if err != nil { + b.Fatalf("Failed to parse parameters: %v", err) + } + + requests := b.N + if requests < c { + b.Logf("b.N is %d must be greater than threads %d. Consider running with --test.benchtime=Nx where N >= %d", b.N, c, c) + requests = c + } b.Run(name, func(b *testing.B) { hey := &tools.Hey{ - Requests: c * b.N, + Requests: requests, Concurrency: c, Doc: filename, } @@ -91,5 +153,10 @@ func runHttpd(b *testing.B, hey *tools.Hey, reverse bool) { }, } httpdCmd := []string{"sh", "-c", "mkdir -p /tmp/html; cp -r /local/* /tmp/html/.; apache2 -X"} - runStaticServer(b, httpdRunOpts, httpdCmd, port, hey, reverse) + runStaticServer(b, h, httpdRunOpts, httpdCmd, port, hey, reverse) +} + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) } diff --git a/test/benchmarks/network/iperf_test.go b/test/benchmarks/network/iperf_test.go index b8ab7dfb8..9d64db943 100644 --- a/test/benchmarks/network/iperf_test.go +++ b/test/benchmarks/network/iperf_test.go @@ -15,6 +15,7 @@ package network import ( "context" + "os" "testing" "gvisor.dev/gvisor/pkg/test/dockerutil" @@ -23,9 +24,11 @@ import ( "gvisor.dev/gvisor/test/benchmarks/tools" ) +var h harness.Harness + func BenchmarkIperf(b *testing.B) { iperf := tools.Iperf{ - Time: 10, // time in seconds to run client. + Time: b.N, // time in seconds to run client. } clientMachine, err := h.GetMachine() @@ -97,17 +100,19 @@ func BenchmarkIperf(b *testing.B) { // Restart the server profiles. If the server isn't being profiled // this does nothing. server.RestartProfiles() - for i := 0; i < b.N; i++ { - out, err := client.Run(ctx, dockerutil.RunOpts{ - Image: "benchmarks/iperf", - }, iperf.MakeCmd(ip, servingPort)...) - if err != nil { - b.Fatalf("failed to run client: %v", err) - } - b.StopTimer() - iperf.Report(b, out) - b.StartTimer() + out, err := client.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/iperf", + }, iperf.MakeCmd(ip, servingPort)...) + if err != nil { + b.Fatalf("failed to run client: %v", err) } + b.StopTimer() + iperf.Report(b, out) }) } } + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/network/network.go b/test/benchmarks/network/network.go index ce17ddb94..b18bc2b3c 100644 --- a/test/benchmarks/network/network.go +++ b/test/benchmarks/network/network.go @@ -16,16 +16,73 @@ package network import ( - "os" + "context" "testing" + "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/test/benchmarks/harness" + "gvisor.dev/gvisor/test/benchmarks/tools" ) -var h harness.Harness +// runStaticServer runs static serving workloads (httpd, nginx). +func runStaticServer(b *testing.B, h harness.Harness, serverOpts dockerutil.RunOpts, serverCmd []string, port int, hey *tools.Hey, reverse bool) { + ctx := context.Background() -// TestMain is the main method for package network. -func TestMain(m *testing.M) { - h.Init() - os.Exit(m.Run()) + // Get two machines: a client and server. + clientMachine, err := h.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer clientMachine.CleanUp() + + serverMachine, err := h.GetMachine() + if err != nil { + b.Fatalf("failed to get machine: %v", err) + } + defer serverMachine.CleanUp() + + // Make the containers. 'reverse=true' specifies that the client should use the + // runtime under test. + var client, server *dockerutil.Container + if reverse { + client = clientMachine.GetContainer(ctx, b) + server = serverMachine.GetNativeContainer(ctx, b) + } else { + client = clientMachine.GetNativeContainer(ctx, b) + server = serverMachine.GetContainer(ctx, b) + } + defer client.CleanUp(ctx) + defer server.CleanUp(ctx) + + // Start the server. + if err := server.Spawn(ctx, serverOpts, serverCmd...); err != nil { + b.Fatalf("failed to start server: %v", err) + } + + // Get its IP. + ip, err := serverMachine.IPAddress() + if err != nil { + b.Fatalf("failed to find server ip: %v", err) + } + + // Get the published port. + servingPort, err := server.FindPort(ctx, port) + if err != nil { + b.Fatalf("failed to find server port %d: %v", port, err) + } + + // Make sure the server is serving. + harness.WaitUntilServing(ctx, clientMachine, ip, servingPort) + b.ResetTimer() + server.RestartProfiles() + out, err := client.Run(ctx, dockerutil.RunOpts{ + Image: "benchmarks/hey", + }, hey.MakeCmd(ip, servingPort)...) + if err != nil { + b.Fatalf("run failed with: %v", err) + } + + b.StopTimer() + hey.Report(b, out) + b.StartTimer() } diff --git a/test/benchmarks/network/nginx_test.go b/test/benchmarks/network/nginx_test.go index 08565d0b2..87449612a 100644 --- a/test/benchmarks/network/nginx_test.go +++ b/test/benchmarks/network/nginx_test.go @@ -14,13 +14,17 @@ package network import ( + "os" "strconv" "testing" "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/test/benchmarks/harness" "gvisor.dev/gvisor/test/benchmarks/tools" ) +var h harness.Harness + // see Dockerfile '//images/benchmarks/nginx'. var nginxDocs = map[string]string{ "notfound": "notfound", @@ -44,6 +48,22 @@ func BenchmarkReverseNginxDocSize(b *testing.B) { benchmarkNginxDocSize(b, true /* reverse */, true /* tmpfs */) } +// BenchmarkContinuousNginx runs specific benchmarks for continous jobs. +// The runtime under test is the sever serving a runc client. +func BenchmarkContinuousNginx(b *testing.B) { + sizes := []string{"10Kb", "100Kb", "1Mb"} + threads := []int{1, 25, 100, 1000} + benchmarkNginxContinuous(b, threads, sizes, false /*reverse*/) +} + +// BenchmarkContinuousNginxReverse runs specific benchmarks for continous jobs. +// The runtime under test is the client downloading from a runc server. +func BenchmarkContinuousNginxReverse(b *testing.B) { + sizes := []string{"10Kb", "100Kb", "1Mb"} + threads := []int{1, 25, 100, 1000} + benchmarkNginxContinuous(b, threads, sizes, true /*reverse*/) +} + // benchmarkNginxDocSize iterates through all doc sizes, running subbenchmarks // for each size. func benchmarkNginxDocSize(b *testing.B, reverse, tmpfs bool) { @@ -72,9 +92,14 @@ func benchmarkNginxDocSize(b *testing.B, reverse, tmpfs bool) { b.Fatalf("Failed to parse parameters: %v", err) } + requests := b.N + if requests < c { + b.Logf("b.N is %d must be greater than threads %d. Consider running with --test.benchtime=Nx where N >= %d", b.N, c, c) + requests = c + } b.Run(name, func(b *testing.B) { hey := &tools.Hey{ - Requests: c * b.N, + Requests: requests, Concurrency: c, Doc: filename, } @@ -84,6 +109,47 @@ func benchmarkNginxDocSize(b *testing.B, reverse, tmpfs bool) { } } +// benchmarkNginxContinuous iterates through given sizes and concurrencies on a tmpfs mount. +func benchmarkNginxContinuous(b *testing.B, concurrency []int, sizes []string, reverse bool) { + for _, size := range sizes { + filename := nginxDocs[size] + for _, c := range concurrency { + fsize := tools.Parameter{ + Name: "filesize", + Value: size, + } + + threads := tools.Parameter{ + Name: "concurrency", + Value: strconv.Itoa(c), + } + + fs := tools.Parameter{ + Name: "filesystem", + Value: "tmpfs", + } + + name, err := tools.ParametersToName(fsize, threads, fs) + if err != nil { + b.Fatalf("Failed to parse parameters: %v", err) + } + requests := b.N + if requests < c { + b.Logf("b.N is %d must be greater than threads %d. Consider running with --test.benchtime=Nx where N >= %d", b.N, c, c) + requests = c + } + b.Run(name, func(b *testing.B) { + hey := &tools.Hey{ + Requests: requests, + Concurrency: c, + Doc: filename, + } + runNginx(b, hey, reverse, true /*tmpfs*/) + }) + } + } +} + // runNginx configures the static serving methods to run httpd. func runNginx(b *testing.B, hey *tools.Hey, reverse, tmpfs bool) { // nginx runs on port 80. @@ -99,5 +165,10 @@ func runNginx(b *testing.B, hey *tools.Hey, reverse, tmpfs bool) { } // Command copies nginxDocs to tmpfs serving directory and runs nginx. - runStaticServer(b, nginxRunOpts, nginxCmd, port, hey, reverse) + runStaticServer(b, h, nginxRunOpts, nginxCmd, port, hey, reverse) +} + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) } diff --git a/test/benchmarks/network/node_test.go b/test/benchmarks/network/node_test.go index 254538899..3e837a9e4 100644 --- a/test/benchmarks/network/node_test.go +++ b/test/benchmarks/network/node_test.go @@ -15,6 +15,7 @@ package network import ( "context" + "os" "strconv" "testing" "time" @@ -24,6 +25,8 @@ import ( "gvisor.dev/gvisor/test/benchmarks/tools" ) +var h harness.Harness + // BenchmarkNode runs requests using 'hey' against a Node server run on // 'runtime'. The server responds to requests by grabbing some data in a // redis instance and returns the data in its reponse. The test loops through @@ -39,9 +42,14 @@ func BenchmarkNode(b *testing.B) { if err != nil { b.Fatalf("Failed to parse parameters: %v", err) } + requests := b.N + if requests < c { + b.Logf("b.N is %d must be greater than threads %d. Consider running with --test.benchtime=Nx where N >= %d", b.N, c, c) + requests = c + } b.Run(name, func(b *testing.B) { hey := &tools.Hey{ - Requests: b.N * c, // Requests b.N requests per thread. + Requests: requests, Concurrency: c, } runNode(b, hey) @@ -131,5 +139,9 @@ func runNode(b *testing.B, hey *tools.Hey) { // Stop the timer to parse the data and report stats. b.StopTimer() hey.Report(b, out) - b.StartTimer() +} + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) } diff --git a/test/benchmarks/network/ruby_test.go b/test/benchmarks/network/ruby_test.go index 0174ff3f3..c89672873 100644 --- a/test/benchmarks/network/ruby_test.go +++ b/test/benchmarks/network/ruby_test.go @@ -16,6 +16,7 @@ package network import ( "context" "fmt" + "os" "strconv" "testing" "time" @@ -25,6 +26,8 @@ import ( "gvisor.dev/gvisor/test/benchmarks/tools" ) +var h harness.Harness + // BenchmarkRuby runs requests using 'hey' against a ruby application server. // On start, ruby app generates some random data and pushes it to a redis // instance. On a request, the app grabs for random entries from the redis @@ -40,9 +43,14 @@ func BenchmarkRuby(b *testing.B) { if err != nil { b.Fatalf("Failed to parse parameters: %v", err) } + requests := b.N + if requests < c { + b.Logf("b.N is %d must be greater than threads %d. Consider running with --test.benchtime=Nx where N >= %d", b.N, c, c) + requests = c + } b.Run(name, func(b *testing.B) { hey := &tools.Hey{ - Requests: b.N * c, // b.N requests per thread. + Requests: requests, Concurrency: c, } runRuby(b, hey) @@ -52,7 +60,6 @@ func BenchmarkRuby(b *testing.B) { // runRuby runs the test for a given # of requests and concurrency. func runRuby(b *testing.B, hey *tools.Hey) { - b.Helper() // The machine to hold Redis and the Ruby Server. serverMachine, err := h.GetMachine() if err != nil { @@ -141,3 +148,8 @@ func runRuby(b *testing.B, hey *tools.Hey) { hey.Report(b, out) b.StartTimer() } + +func TestMain(m *testing.M) { + h.Init() + os.Exit(m.Run()) +} diff --git a/test/benchmarks/network/static_server.go b/test/benchmarks/network/static_server.go deleted file mode 100644 index e747a1395..000000000 --- a/test/benchmarks/network/static_server.go +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package network - -import ( - "context" - "testing" - - "gvisor.dev/gvisor/pkg/test/dockerutil" - "gvisor.dev/gvisor/test/benchmarks/harness" - "gvisor.dev/gvisor/test/benchmarks/tools" -) - -// runStaticServer runs static serving workloads (httpd, nginx). -func runStaticServer(b *testing.B, serverOpts dockerutil.RunOpts, serverCmd []string, port int, hey *tools.Hey, reverse bool) { - ctx := context.Background() - - // Get two machines: a client and server. - clientMachine, err := h.GetMachine() - if err != nil { - b.Fatalf("failed to get machine: %v", err) - } - defer clientMachine.CleanUp() - - serverMachine, err := h.GetMachine() - if err != nil { - b.Fatalf("failed to get machine: %v", err) - } - defer serverMachine.CleanUp() - - // Make the containers. 'reverse=true' specifies that the client should use the - // runtime under test. - var client, server *dockerutil.Container - if reverse { - client = clientMachine.GetContainer(ctx, b) - server = serverMachine.GetNativeContainer(ctx, b) - } else { - client = clientMachine.GetNativeContainer(ctx, b) - server = serverMachine.GetContainer(ctx, b) - } - defer client.CleanUp(ctx) - defer server.CleanUp(ctx) - - // Start the server. - if err := server.Spawn(ctx, serverOpts, serverCmd...); err != nil { - b.Fatalf("failed to start server: %v", err) - } - - // Get its IP. - ip, err := serverMachine.IPAddress() - if err != nil { - b.Fatalf("failed to find server ip: %v", err) - } - - // Get the published port. - servingPort, err := server.FindPort(ctx, port) - if err != nil { - b.Fatalf("failed to find server port %d: %v", port, err) - } - - // Make sure the server is serving. - harness.WaitUntilServing(ctx, clientMachine, ip, servingPort) - b.ResetTimer() - server.RestartProfiles() - out, err := client.Run(ctx, dockerutil.RunOpts{ - Image: "benchmarks/hey", - }, hey.MakeCmd(ip, servingPort)...) - if err != nil { - b.Fatalf("run failed with: %v", err) - } - - b.StopTimer() - hey.Report(b, out) - b.StartTimer() -} diff --git a/test/benchmarks/tools/iperf.go b/test/benchmarks/tools/iperf.go index 5c4e7125b..891d32704 100644 --- a/test/benchmarks/tools/iperf.go +++ b/test/benchmarks/tools/iperf.go @@ -31,7 +31,7 @@ type Iperf struct { // MakeCmd returns a iperf client command. func (i *Iperf) MakeCmd(ip net.IP, port int) []string { // iperf report in Kb realtime - return strings.Split(fmt.Sprintf("iperf -f K --realtime --time %d -c %s -p %d", i.Time, ip, port), " ") + return strings.Split(fmt.Sprintf("iperf -f K --realtime --time %d --client %s --port %d", i.Time, ip, port), " ") } // Report parses output from iperf client and reports metrics. diff --git a/test/benchmarks/tools/redis.go b/test/benchmarks/tools/redis.go index e35886437..a42e3456e 100644 --- a/test/benchmarks/tools/redis.go +++ b/test/benchmarks/tools/redis.go @@ -29,17 +29,17 @@ type Redis struct { } // MakeCmd returns a redis-benchmark client command. -func (r *Redis) MakeCmd(ip net.IP, port int) []string { +func (r *Redis) MakeCmd(ip net.IP, port, requests int) []string { // There is no -t PING_BULK for redis-benchmark, so adjust the command in that case. // Note that "ping" will run both PING_INLINE and PING_BULK. if r.Operation == "PING_BULK" { return strings.Split( - fmt.Sprintf("redis-benchmark --csv -t ping -h %s -p %d", ip, port), " ") + fmt.Sprintf("redis-benchmark --csv -t ping -h %s -p %d -n %d", ip, port, requests), " ") } // runs redis-benchmark -t operation for 100K requests against server. return strings.Split( - fmt.Sprintf("redis-benchmark --csv -t %s -h %s -p %d", r.Operation, ip, port), " ") + fmt.Sprintf("redis-benchmark --csv -t %s -h %s -p %d -n %d", r.Operation, ip, port, requests), " ") } // Report parses output from redis-benchmark client and reports metrics. diff --git a/test/cmd/test_app/fds.go b/test/cmd/test_app/fds.go index a7658eefd..d4354f0d3 100644 --- a/test/cmd/test_app/fds.go +++ b/test/cmd/test_app/fds.go @@ -16,6 +16,7 @@ package main import ( "context" + "io" "io/ioutil" "log" "os" @@ -168,8 +169,8 @@ func (fdr *fdReceiver) Execute(ctx context.Context, f *flag.FlagSet, args ...int file := os.NewFile(uintptr(fd), "received file") defer file.Close() - if _, err := file.Seek(0, os.SEEK_SET); err != nil { - log.Fatalf("Seek(0, 0) failed: %v", err) + if _, err := file.Seek(0, io.SeekStart); err != nil { + log.Fatalf("Error from seek(0, 0): %v", err) } got, err := ioutil.ReadAll(file) diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index 8425abecb..03bdfa889 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -494,6 +494,55 @@ func TestLink(t *testing.T) { } } +// This test ensures we can run ping without errors. +func TestPing4Loopback(t *testing.T) { + if testutil.IsRunningWithHostNet() { + // TODO(gvisor.dev/issue/5011): support ICMP sockets in hostnet and enable + // this test. + t.Skip("hostnet only supports TCP/UDP sockets, so ping is not supported.") + } + + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + if got, err := d.Run(ctx, dockerutil.RunOpts{ + Image: "basic/ping4test", + }, "/root/ping4.sh"); err != nil { + t.Fatalf("docker run failed: %s", err) + } else if got != "" { + t.Errorf("test failed:\n%s", got) + } +} + +// This test ensures we can enable ipv6 on loopback and run ping6 without +// errors. +func TestPing6Loopback(t *testing.T) { + if testutil.IsRunningWithHostNet() { + // TODO(gvisor.dev/issue/5011): support ICMP sockets in hostnet and enable + // this test. + t.Skip("hostnet only supports TCP/UDP sockets, so ping6 is not supported.") + } + + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + if got, err := d.Run(ctx, dockerutil.RunOpts{ + Image: "basic/ping6test", + // The CAP_NET_ADMIN capability is required to use the `ip` utility, which + // we use to enable ipv6 on loopback. + // + // By default, ipv6 loopback is not enabled by runsc, because docker does + // not assign an ipv6 address to the test container. + CapAdd: []string{"NET_ADMIN"}, + }, "/root/ping6.sh"); err != nil { + t.Fatalf("docker run failed: %s", err) + } else if got != "" { + t.Errorf("test failed:\n%s", got) + } +} + func TestMain(m *testing.M) { dockerutil.EnsureSupportedDockerVersion() flag.Parse() diff --git a/test/fuse/BUILD b/test/fuse/BUILD index 8e31fdd41..74500ec84 100644 --- a/test/fuse/BUILD +++ b/test/fuse/BUILD @@ -71,3 +71,8 @@ syscall_test( fuse = "True", test = "//test/fuse/linux:setstat_test", ) + +syscall_test( + fuse = "True", + test = "//test/fuse/linux:mount_test", +) diff --git a/test/fuse/linux/BUILD b/test/fuse/linux/BUILD index 7673252ec..d1fb178e8 100644 --- a/test/fuse/linux/BUILD +++ b/test/fuse/linux/BUILD @@ -228,3 +228,15 @@ cc_binary( "//test/util:test_util", ], ) + +cc_binary( + name = "mount_test", + testonly = 1, + srcs = ["mount_test.cc"], + deps = [ + gtest, + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + ], +) diff --git a/test/fuse/linux/mount_test.cc b/test/fuse/linux/mount_test.cc new file mode 100644 index 000000000..a5c2fbb01 --- /dev/null +++ b/test/fuse/linux/mount_test.cc @@ -0,0 +1,41 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <errno.h> +#include <fcntl.h> +#include <sys/mount.h> + +#include "gtest/gtest.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(FuseMount, FDNotParsable) { + int devfd; + EXPECT_THAT(devfd = open("/dev/fuse", O_RDWR), SyscallSucceeds()); + std::string mount_opts = "fd=thiscantbeparsed"; + TempPath mount_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(mount("fuse", mount_dir.path().c_str(), "fuse", + MS_NODEV | MS_NOSUID, mount_opts.c_str()), + SyscallFailsWithErrno(EINVAL)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go index d3e5efd4f..f4af45e96 100644 --- a/test/iptables/filter_output.go +++ b/test/iptables/filter_output.go @@ -248,7 +248,7 @@ func (FilterOutputOwnerFail) Name() string { // ContainerAction implements TestCase.ContainerAction. func (FilterOutputOwnerFail) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error { if err := filterTable(ipv6, "-A", "OUTPUT", "-p", "udp", "-m", "owner", "-j", "ACCEPT"); err == nil { - return fmt.Errorf("Invalid argument") + return fmt.Errorf("invalid argument") } return nil diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index 834f7615f..4733146c0 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -89,6 +89,10 @@ func iptablesTest(t *testing.T, test TestCase, ipv6 bool) { // Get the container IP. ip, err := d.FindIP(ctx, ipv6) if err != nil { + // If ipv6 is not configured, don't fail. + if ipv6 && err == dockerutil.ErrNoIP { + t.Skipf("No ipv6 address is available.") + } t.Fatalf("failed to get container IP: %v", err) } diff --git a/test/packetdrill/BUILD b/test/packetdrill/BUILD index 49642f282..5d95516ee 100644 --- a/test/packetdrill/BUILD +++ b/test/packetdrill/BUILD @@ -38,6 +38,15 @@ packetdrill_test( scripts = ["tcp_defer_accept_timeout.pkt"], ) +test_suite( + name = "all_tests", + tags = [ + "manual", + "packetdrill", + ], + tests = existing_rules(), +) + bzl_library( name = "defs_bzl", srcs = ["defs.bzl"], diff --git a/test/packetdrill/defs.bzl b/test/packetdrill/defs.bzl index fc28ce9ba..a6cbcc376 100644 --- a/test/packetdrill/defs.bzl +++ b/test/packetdrill/defs.bzl @@ -15,7 +15,7 @@ def _packetdrill_test_impl(ctx): # Make sure that everything is readable here. "find . -type f -exec chmod a+rx {} \\;", "find . -type d -exec chmod a+rx {} \\;", - "%s %s --init_script %s $@ -- %s\n" % ( + "%s %s --init_script %s \"$@\" -- %s\n" % ( test_runner.short_path, " ".join(ctx.attr.flags), ctx.files._init_script[0].short_path, @@ -80,9 +80,7 @@ def packetdrill_netstack_test(name, **kwargs): kwargs["tags"] = PACKETDRILL_TAGS _packetdrill_test( name = name, - # This is the default runtime unless - # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value. - flags = ["--dut_platform", "netstack", "--runtime", "runsc-d"], + flags = ["--dut_platform", "netstack"], **kwargs ) diff --git a/test/packetdrill/packetdrill_test.sh b/test/packetdrill/packetdrill_test.sh index 922547d65..d25cad83a 100755 --- a/test/packetdrill/packetdrill_test.sh +++ b/test/packetdrill/packetdrill_test.sh @@ -29,7 +29,7 @@ function failure() { } trap 'failure ${LINENO} "$BASH_COMMAND"' ERR -declare -r LONGOPTS="dut_platform:,init_script:,runtime:" +declare -r LONGOPTS="dut_platform:,init_script:,runtime:,partition:,total_partitions:" # Don't use declare below so that the error from getopt will end the script. PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@") @@ -48,12 +48,17 @@ while true; do shift 2 ;; --runtime) - # Not readonly because there might be multiple --runtime arguments and we - # want to use just the last one. Only used if --dut_platform is - # "netstack". declare RUNTIME="$2" shift 2 ;; + --partition) + # Ignored. + shift 2 + ;; + --total_partitions) + # Ignored. + shift 2 + ;; --) shift break diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD index 605dd4972..888c44343 100644 --- a/test/packetimpact/runner/BUILD +++ b/test/packetimpact/runner/BUILD @@ -32,6 +32,7 @@ go_library( deps = [ "//pkg/test/dockerutil", "//test/packetimpact/netdevs", + "//test/packetimpact/testbench", "@com_github_docker_docker//api/types/mount:go_default_library", ], ) diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl index 1038e3c8d..c6c95546a 100644 --- a/test/packetimpact/runner/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -12,10 +12,11 @@ def _packetimpact_test_impl(ctx): # current user, and no other users will be mapped in that namespace. # Make sure that everything is readable here. "find . -type f -or -type d -exec chmod a+rx {} \\;", - "%s %s --testbench_binary %s $@\n" % ( + "%s %s --testbench_binary %s --num_duts %d $@\n" % ( test_runner.short_path, " ".join(ctx.attr.flags), ctx.files.testbench_binary[0].short_path, + ctx.attr.num_duts, ), ]) ctx.actions.write(bench, bench_content, is_executable = True) @@ -51,6 +52,10 @@ _packetimpact_test = rule( mandatory = False, default = [], ), + "num_duts": attr.int( + mandatory = False, + default = 1, + ), }, test = True, implementation = _packetimpact_test_impl, @@ -110,24 +115,27 @@ def packetimpact_netstack_test( **kwargs ) -def packetimpact_go_test(name, expect_native_failure = False, expect_netstack_failure = False): +def packetimpact_go_test(name, expect_native_failure = False, expect_netstack_failure = False, num_duts = 1): """Add packetimpact tests written in go. Args: name: name of the test expect_native_failure: the test must fail natively expect_netstack_failure: the test must fail for Netstack + num_duts: how many DUTs are needed for the test """ testbench_binary = name + "_test" packetimpact_native_test( name = name, expect_failure = expect_native_failure, testbench_binary = testbench_binary, + num_duts = num_duts, ) packetimpact_netstack_test( name = name, expect_failure = expect_netstack_failure, testbench_binary = testbench_binary, + num_duts = num_duts, ) def packetimpact_testbench(name, size = "small", pure = True, **kwargs): @@ -153,7 +161,7 @@ def packetimpact_testbench(name, size = "small", pure = True, **kwargs): PacketimpactTestInfo = provider( doc = "Provide information for packetimpact tests", - fields = ["name", "expect_netstack_failure"], + fields = ["name", "expect_netstack_failure", "num_duts"], ) ALL_TESTS = [ @@ -216,6 +224,9 @@ ALL_TESTS = [ name = "tcp_user_timeout", ), PacketimpactTestInfo( + name = "tcp_zero_receive_window", + ), + PacketimpactTestInfo( name = "tcp_queue_receive_in_syn_sent", ), PacketimpactTestInfo( @@ -243,13 +254,9 @@ ALL_TESTS = [ ), PacketimpactTestInfo( name = "icmpv6_param_problem", - # TODO(b/153485026): Fix netstack then remove the line below. - expect_netstack_failure = True, ), PacketimpactTestInfo( name = "ipv6_unknown_options_action", - # TODO(b/159928940): Fix netstack then remove the line below. - expect_netstack_failure = True, ), PacketimpactTestInfo( name = "ipv4_fragment_reassembly", @@ -259,6 +266,7 @@ ALL_TESTS = [ ), PacketimpactTestInfo( name = "ipv6_fragment_icmp_error", + num_duts = 3, ), PacketimpactTestInfo( name = "udp_send_recv_dgram", diff --git a/test/packetimpact/runner/dut.go b/test/packetimpact/runner/dut.go index 59bb68eb1..3e26c73cb 100644 --- a/test/packetimpact/runner/dut.go +++ b/test/packetimpact/runner/dut.go @@ -17,6 +17,7 @@ package runner import ( "context" + "encoding/json" "flag" "fmt" "io/ioutil" @@ -34,6 +35,7 @@ import ( "github.com/docker/docker/api/types/mount" "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/test/packetimpact/netdevs" + "gvisor.dev/gvisor/test/packetimpact/testbench" ) // stringList implements flag.Value. @@ -56,9 +58,10 @@ var ( tshark = false extraTestArgs = stringList{} expectFailure = false + numDUTs = 1 - // DutAddr is the IP addres for DUT. - DutAddr = net.IPv4(0, 0, 0, 10) + // DUTAddr is the IP addres for DUT. + DUTAddr = net.IPv4(0, 0, 0, 10) testbenchAddr = net.IPv4(0, 0, 0, 20) ) @@ -71,10 +74,15 @@ func RegisterFlags(fs *flag.FlagSet) { fs.BoolVar(&tshark, "tshark", false, "use more verbose tshark in logs instead of tcpdump") fs.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench") fs.BoolVar(&expectFailure, "expect_failure", false, "expect that the test will fail when run") + fs.IntVar(&numDUTs, "num_duts", numDUTs, "the number of duts to create") } -// CtrlPort is the port that posix_server listens on. -const CtrlPort = "40000" +const ( + // CtrlPort is the port that posix_server listens on. + CtrlPort uint16 = 40000 + // testOutputDir is the directory in each container that holds test output. + testOutputDir = "/tmp/testoutput" +) // logger implements testutil.Logger. // @@ -95,16 +103,21 @@ func (l logger) Logf(format string, args ...interface{}) { } } -// TestWithDUT runs a packetimpact test with the given information. -func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Container) DUT, containerAddr net.IP) { - if testbenchBinary == "" { - t.Fatal("--testbench_binary is missing") - } - dockerutil.EnsureSupportedDockerVersion() +// dutInfo encapsulates all the essential information to set up testbench +// container. +type dutInfo struct { + dut DUT + ctrlNet, testNet *dockerutil.Network + netInfo *testbench.DUTTestNet +} - // Create the networks needed for the test. One control network is needed for - // the gRPC control packets and one test network on which to transmit the test - // packets. +// setUpDUT will set up one DUT and return information for setting up the +// container for testbench. +func setUpDUT(ctx context.Context, t *testing.T, id int, mkDevice func(*dockerutil.Container) DUT) (dutInfo, error) { + // Create the networks needed for the test. One control network is needed + // for the gRPC control packets and one test network on which to transmit + // the test packets. + var info dutInfo ctrlNet := dockerutil.NewNetwork(ctx, logger("ctrlNet")) testNet := dockerutil.NewNetwork(ctx, logger("testNet")) for _, dn := range []*dockerutil.Network{ctrlNet, testNet} { @@ -113,8 +126,8 @@ func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Co t.Log("creating docker network:", err) const wait = 100 * time.Millisecond t.Logf("sleeping %s and will try creating docker network again", wait) - // This can fail if another docker network claimed the same IP so we'll - // just try again. + // This can fail if another docker network claimed the same IP so we + // will just try again. time.Sleep(wait) continue } @@ -128,115 +141,204 @@ func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Co }) // Sanity check. if inspect, err := dn.Inspect(ctx); err != nil { - t.Fatalf("failed to inspect network %s: %v", dn.Name, err) + return dutInfo{}, fmt.Errorf("failed to inspect network %s: %w", dn.Name, err) } else if inspect.Name != dn.Name { - t.Fatalf("name mismatch for network want: %s got: %s", dn.Name, inspect.Name) + return dutInfo{}, fmt.Errorf("name mismatch for network want: %s got: %s", dn.Name, inspect.Name) } } - - tmpDir, err := ioutil.TempDir("", "container-output") - if err != nil { - t.Fatal("creating temp dir:", err) - } - t.Cleanup(func() { - if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil { - t.Errorf("unable to copy container output files: %s", err) - } - if err := os.RemoveAll(tmpDir); err != nil { - t.Errorf("failed to remove tmpDir %s: %s", tmpDir, err) - } - }) - - const testOutputDir = "/tmp/testoutput" + info.ctrlNet = ctrlNet + info.testNet = testNet // Create the Docker container for the DUT. - var dut *dockerutil.Container + var dut DUT if native { - dut = dockerutil.MakeNativeContainer(ctx, logger("dut")) + dut = mkDevice(dockerutil.MakeNativeContainer(ctx, logger(fmt.Sprintf("dut-%d", id)))) } else { - dut = dockerutil.MakeContainer(ctx, logger("dut")) + dut = mkDevice(dockerutil.MakeContainer(ctx, logger(fmt.Sprintf("dut-%d", id)))) } - t.Cleanup(func() { - dut.CleanUp(ctx) - }) + info.dut = dut runOpts := dockerutil.RunOpts{ Image: "packetimpact", CapAdd: []string{"NET_ADMIN"}, - Mounts: []mount.Mount{{ - Type: mount.TypeBind, - Source: tmpDir, - Target: testOutputDir, - ReadOnly: false, - }}, + } + if _, err := MountTempDirectory(t, &runOpts, "dut-output", testOutputDir); err != nil { + return dutInfo{}, err } - device := mkDevice(dut) - remoteIPv6, remoteMAC, dutDeviceID, dutTestNetDev := device.Prepare(ctx, t, runOpts, ctrlNet, testNet, containerAddr) + ipv4PrefixLength, _ := testNet.Subnet.Mask.Size() + remoteIPv6, remoteMAC, dutDeviceID, dutTestNetDev, err := dut.Prepare(ctx, t, runOpts, ctrlNet, testNet) + if err != nil { + return dutInfo{}, err + } + info.netInfo = &testbench.DUTTestNet{ + RemoteMAC: remoteMAC, + RemoteIPv4: AddressInSubnet(DUTAddr, *testNet.Subnet), + RemoteIPv6: remoteIPv6, + RemoteDevID: dutDeviceID, + RemoteDevName: dutTestNetDev, + LocalIPv4: AddressInSubnet(testbenchAddr, *testNet.Subnet), + IPv4PrefixLength: ipv4PrefixLength, + POSIXServerIP: AddressInSubnet(DUTAddr, *ctrlNet.Subnet), + POSIXServerPort: CtrlPort, + } + return info, nil +} - // Create the Docker container for the testbench. - testbench := dockerutil.MakeNativeContainer(ctx, logger("testbench")) +// TestWithDUT runs a packetimpact test with the given information. +func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Container) DUT) { + if testbenchBinary == "" { + t.Fatal("--testbench_binary is missing") + } + dockerutil.EnsureSupportedDockerVersion() - tbb := path.Base(testbenchBinary) - containerTestbenchBinary := filepath.Join("/packetimpact", tbb) - testbench.CopyFiles(&runOpts, "/packetimpact", filepath.Join("test/packetimpact/tests", tbb)) - - // snifferNetDev is a network device on the test orchestrator that we will - // run sniffer (tcpdump or tshark) on and inject traffic to, not to be - // confused with the device on the DUT. - const snifferNetDev = "eth2" - // Run tcpdump in the test bench unbuffered, without DNS resolution, just on - // the interface with the test packets. - snifferArgs := []string{ - "tcpdump", - "-S", "-vvv", "-U", "-n", - "-i", snifferNetDev, - "-w", testOutputDir + "/dump.pcap", + dutInfoChan := make(chan dutInfo, numDUTs) + errChan := make(chan error, numDUTs) + var dockerNetworks []*dockerutil.Network + var dutTestNets []*testbench.DUTTestNet + var duts []DUT + + setUpCtx, cancelSetup := context.WithCancel(ctx) + t.Cleanup(cancelSetup) + for i := 0; i < numDUTs; i++ { + go func(i int) { + info, err := setUpDUT(setUpCtx, t, i, mkDevice) + if err != nil { + errChan <- err + } else { + dutInfoChan <- info + } + }(i) } - snifferRegex := "tcpdump: listening.*\n" - if tshark { - // Run tshark in the test bench unbuffered, without DNS resolution, just on - // the interface with the test packets. - snifferArgs = []string{ - "tshark", "-V", "-l", "-n", "-i", snifferNetDev, - "-o", "tcp.check_checksum:TRUE", - "-o", "udp.check_checksum:TRUE", + for i := 0; i < numDUTs; i++ { + select { + case info := <-dutInfoChan: + dockerNetworks = append(dockerNetworks, info.ctrlNet, info.testNet) + dutTestNets = append(dutTestNets, info.netInfo) + duts = append(duts, info.dut) + case err := <-errChan: + t.Fatal(err) } - snifferRegex = "Capturing on.*\n" } + // Create the Docker container for the testbench. + testbenchContainer := dockerutil.MakeNativeContainer(ctx, logger("testbench")) + + runOpts := dockerutil.RunOpts{ + Image: "packetimpact", + CapAdd: []string{"NET_ADMIN"}, + } + if _, err := MountTempDirectory(t, &runOpts, "testbench-output", testOutputDir); err != nil { + t.Fatal(err) + } + tbb := path.Base(testbenchBinary) + containerTestbenchBinary := filepath.Join("/packetimpact", tbb) + testbenchContainer.CopyFiles(&runOpts, "/packetimpact", filepath.Join("test/packetimpact/tests", tbb)) + if err := StartContainer( ctx, runOpts, - testbench, + testbenchContainer, testbenchAddr, - []*dockerutil.Network{ctrlNet, testNet}, - snifferArgs..., + dockerNetworks, + "tail", "-f", "/dev/null", ); err != nil { - t.Fatalf("failed to start docker container for testbench sniffer: %s", err) + t.Fatalf("cannot start testbench container: %s", err) } - // Kill so that it will flush output. - t.Cleanup(func() { - time.Sleep(1 * time.Second) - testbench.Exec(ctx, dockerutil.ExecOpts{}, "killall", snifferArgs[0]) - }) - if _, err := testbench.WaitForOutput(ctx, snifferRegex, 60*time.Second); err != nil { - t.Fatalf("sniffer on %s never listened: %s", dut.Name, err) + for i := range dutTestNets { + name, info, err := deviceByIP(ctx, testbenchContainer, dutTestNets[i].LocalIPv4) + if err != nil { + t.Fatalf("failed to get the device name associated with %s: %s", dutTestNets[i].LocalIPv4, err) + } + dutTestNets[i].LocalDevName = name + dutTestNets[i].LocalDevID = info.ID + dutTestNets[i].LocalMAC = info.MAC + localIPv6, err := getOrAssignIPv6Addr(ctx, testbenchContainer, name) + if err != nil { + t.Fatalf("failed to get IPV6 address on %s: %s", testbenchContainer.Name, err) + } + dutTestNets[i].LocalIPv6 = localIPv6 + } + dutTestNetsBytes, err := json.Marshal(dutTestNets) + if err != nil { + t.Fatalf("failed to marshal %v into json: %s", dutTestNets, err) } - // When the Linux kernel receives a SYN-ACK for a SYN it didn't send, it - // will respond with an RST. In most packetimpact tests, the SYN is sent - // by the raw socket and the kernel knows nothing about the connection, this - // behavior will break lots of TCP related packetimpact tests. To prevent - // this, we can install the following iptables rules. The raw socket that - // packetimpact tests use will still be able to see everything. - for _, bin := range []string{"iptables", "ip6tables"} { - if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, bin, "-A", "INPUT", "-i", snifferNetDev, "-p", "tcp", "-j", "DROP"); err != nil { - t.Fatalf("unable to Exec %s on container %s: %s, logs from testbench:\n%s", bin, testbench.Name, err, logs) + baseSnifferArgs := []string{ + "tcpdump", + "-vvv", + "--absolute-tcp-sequence-numbers", + "--packet-buffered", + // Disable DNS resolution. + "-n", + // run tcpdump as root since the output directory is owned by root. From + // `man tcpdump`: + // + // -Z user + // --relinquish-privileges=user + // If tcpdump is running as root, after opening the capture device + // or input savefile, change the user ID to user and the group ID to + // the primary group of user. + // This behavior is enabled by default (-Z tcpdump), and can be + // disabled by -Z root. + "-Z", "root", + } + if tshark { + baseSnifferArgs = []string{ + "tshark", + "-V", + "-o", "tcp.check_checksum:TRUE", + "-o", "udp.check_checksum:TRUE", + // Disable buffering. + "-l", + // Disable DNS resolution. + "-n", + } + } + for _, n := range dutTestNets { + snifferArgs := append(baseSnifferArgs, "-i", n.LocalDevName) + if !tshark { + snifferArgs = append( + snifferArgs, + "-w", + filepath.Join(testOutputDir, fmt.Sprintf("%s.pcap", n.LocalDevName)), + ) + } + p, err := testbenchContainer.ExecProcess(ctx, dockerutil.ExecOpts{}, snifferArgs...) + if err != nil { + t.Fatalf("failed to start exec a sniffer on %s: %s", n.LocalDevName, err) + } + t.Cleanup(func() { + if snifferOut, err := p.Logs(); err != nil { + t.Errorf("sniffer logs failed: %s\n%s", err, snifferOut) + } else { + t.Logf("sniffer logs:\n%s", snifferOut) + } + }) + // When the Linux kernel receives a SYN-ACK for a SYN it didn't send, it + // will respond with an RST. In most packetimpact tests, the SYN is sent + // by the raw socket, the kernel knows nothing about the connection, this + // behavior will break lots of TCP related packetimpact tests. To prevent + // this, we can install the following iptables rules. The raw socket that + // packetimpact tests use will still be able to see everything. + for _, bin := range []string{"iptables", "ip6tables"} { + if logs, err := testbenchContainer.Exec(ctx, dockerutil.ExecOpts{}, bin, "-A", "INPUT", "-i", n.LocalDevName, "-p", "tcp", "-j", "DROP"); err != nil { + t.Fatalf("unable to Exec %s on container %s: %s, logs from testbench:\n%s", bin, testbenchContainer.Name, err, logs) + } } } + t.Cleanup(func() { + // Wait 1 second before killing tcpdump to give it time to flush + // any packets. On linux tests killing it immediately can + // sometimes result in partial pcaps. + time.Sleep(1 * time.Second) + if logs, err := testbenchContainer.Exec(ctx, dockerutil.ExecOpts{}, "killall", baseSnifferArgs[0]); err != nil { + t.Errorf("failed to kill all sniffers: %s, logs: %s", err, logs) + } + }) + // FIXME(b/156449515): Some piece of the system has a race. The old // bash script version had a sleep, so we have one too. The race should // be fixed and this sleep removed. @@ -248,31 +350,29 @@ func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Co testArgs := []string{containerTestbenchBinary} testArgs = append(testArgs, extraTestArgs...) testArgs = append(testArgs, - "--posix_server_ip", AddressInSubnet(DutAddr, *ctrlNet.Subnet).String(), - "--posix_server_port", CtrlPort, - "--remote_ipv4", AddressInSubnet(DutAddr, *testNet.Subnet).String(), - "--local_ipv4", AddressInSubnet(testbenchAddr, *testNet.Subnet).String(), - "--remote_ipv6", remoteIPv6.String(), - "--remote_mac", remoteMAC.String(), - "--remote_interface_id", fmt.Sprintf("%d", dutDeviceID), - "--local_device", snifferNetDev, - "--remote_device", dutTestNetDev, fmt.Sprintf("--native=%t", native), + "--dut_test_nets_json", string(dutTestNetsBytes), ) - testbenchLogs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, testArgs...) + testbenchLogs, err := testbenchContainer.Exec(ctx, dockerutil.ExecOpts{}, testArgs...) if (err != nil) != expectFailure { var dutLogs string - if logs, err := device.Logs(ctx); err != nil { - dutLogs = fmt.Sprintf("failed to fetch DUT logs: %s", err) - } else { - dutLogs = logs + for i, dut := range duts { + logs, err := dut.Logs(ctx) + if err != nil { + logs = fmt.Sprintf("failed to fetch DUT logs: %s", err) + } + dutLogs = fmt.Sprintf(`%s====== Begin of DUT-%d Logs ====== + +%s + +====== End of DUT-%d Logs ====== + +`, dutLogs, i, logs, i) } t.Errorf(`test error: %v, expect failure: %t -%s - -====== Begin of Testbench Logs ====== +%s====== Begin of Testbench Logs ====== %s @@ -285,7 +385,9 @@ func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Co type DUT interface { // Prepare prepares the dut, starts posix_server and returns the IPv6, MAC // address, the interface ID, and the interface name for the testNet on DUT. - Prepare(ctx context.Context, t *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network, containerAddr net.IP) (net.IP, net.HardwareAddr, uint32, string) + // The t parameter is supposed to be used for t.Cleanup. Don't use it for + // t.Fatal/FailNow functions. + Prepare(ctx context.Context, t *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network) (net.IP, net.HardwareAddr, uint32, string, error) // Logs retrieves the logs from the dut. Logs(ctx context.Context) (string, error) } @@ -303,7 +405,7 @@ func NewDockerDUT(c *dockerutil.Container) DUT { } // Prepare implements DUT.Prepare. -func (dut *DockerDUT) Prepare(ctx context.Context, t *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network, containerAddr net.IP) (net.IP, net.HardwareAddr, uint32, string) { +func (dut *DockerDUT) Prepare(ctx context.Context, _ *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network) (net.IP, net.HardwareAddr, uint32, string, error) { const containerPosixServerBinary = "/packetimpact/posix_server" dut.c.CopyFiles(&runOpts, "/packetimpact", "test/packetimpact/dut/posix_server") @@ -311,45 +413,31 @@ func (dut *DockerDUT) Prepare(ctx context.Context, t *testing.T, runOpts dockeru ctx, runOpts, dut.c, - containerAddr, + DUTAddr, []*dockerutil.Network{ctrlNet, testNet}, containerPosixServerBinary, "--ip=0.0.0.0", - "--port="+CtrlPort, + fmt.Sprintf("--port=%d", CtrlPort), ); err != nil { - t.Fatalf("failed to start docker container for DUT: %s", err) + return nil, nil, 0, "", fmt.Errorf("failed to start docker container for DUT: %w", err) } if _, err := dut.c.WaitForOutput(ctx, "Server listening.*\n", 60*time.Second); err != nil { - t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.c.Name, err) + return nil, nil, 0, "", fmt.Errorf("%s on container %s never listened: %s", containerPosixServerBinary, dut.c.Name, err) } - dutTestDevice, dutDeviceInfo, err := deviceByIP(ctx, dut.c, AddressInSubnet(containerAddr, *testNet.Subnet)) + dutTestDevice, dutDeviceInfo, err := deviceByIP(ctx, dut.c, AddressInSubnet(DUTAddr, *testNet.Subnet)) if err != nil { - t.Fatal(err) + return nil, nil, 0, "", err } - remoteMAC := dutDeviceInfo.MAC - remoteIPv6 := dutDeviceInfo.IPv6Addr - // Netstack as DUT doesn't assign IPv6 addresses automatically so do it if - // needed. - if remoteIPv6 == nil { - if _, err := dut.c.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil { - t.Fatalf("unable to ip addr add on container %s: %s", dut.c.Name, err) - } - // Now try again, to make sure that it worked. - _, dutDeviceInfo, err = deviceByIP(ctx, dut.c, AddressInSubnet(containerAddr, *testNet.Subnet)) - if err != nil { - t.Fatal(err) - } - remoteIPv6 = dutDeviceInfo.IPv6Addr - if remoteIPv6 == nil { - t.Fatalf("unable to set IPv6 address on container %s", dut.c.Name) - } + remoteIPv6, err := getOrAssignIPv6Addr(ctx, dut.c, dutTestDevice) + if err != nil { + return nil, nil, 0, "", fmt.Errorf("failed to get IPv6 address on %s: %s", dut.c.Name, err) } const testNetDev = "eth2" - return remoteIPv6, dutDeviceInfo.MAC, dutDeviceInfo.ID, testNetDev + return remoteIPv6, dutDeviceInfo.MAC, dutDeviceInfo.ID, testNetDev, nil } // Logs implements DUT.Logs. @@ -358,11 +446,7 @@ func (dut *DockerDUT) Logs(ctx context.Context) (string, error) { if err != nil { return "", err } - return fmt.Sprintf(`====== Begin of DUT Logs ====== - -%s - -====== End of DUT Logs ======`, logs), nil + return logs, nil } // AddNetworks connects docker network with the container and assigns the specific IP. @@ -378,25 +462,35 @@ func AddNetworks(ctx context.Context, d *dockerutil.Container, addr net.IP, netw } // AddressInSubnet combines the subnet provided with the address and returns a -// new address. The return address bits come from the subnet where the mask is 1 -// and from the ip address where the mask is 0. +// new address. The return address bits come from the subnet where the mask is +// 1 and from the ip address where the mask is 0. func AddressInSubnet(addr net.IP, subnet net.IPNet) net.IP { - var octets []byte + var octets net.IP for i := 0; i < 4; i++ { octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i]))) } - return net.IP(octets) + return octets } -// deviceByIP finds a deviceInfo and device name from an IP address. -func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string, netdevs.DeviceInfo, error) { +// devicesInfo will run "ip addr show" on the container and parse the output +// to a map[string]netdevs.DeviceInfo. +func devicesInfo(ctx context.Context, d *dockerutil.Container) (map[string]netdevs.DeviceInfo, error) { out, err := d.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "show") if err != nil { - return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w\n%s", d.Name, err, out) + return map[string]netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w\n%s", d.Name, err, out) } devs, err := netdevs.ParseDevices(out) if err != nil { - return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w\n%s", d.Name, err, out) + return map[string]netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w\n%s", d.Name, err, out) + } + return devs, nil +} + +// deviceByIP finds a deviceInfo and device name from an IP address. +func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string, netdevs.DeviceInfo, error) { + devs, err := devicesInfo(ctx, d) + if err != nil { + return "", netdevs.DeviceInfo{}, err } testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs) if err != nil { @@ -405,6 +499,36 @@ func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string return testDevice, deviceInfo, nil } +// getOrAssignIPv6Addr will try to get the IPv6 address for the interface; if an +// address was not assigned, a link-local address based on MAC will be assigned +// to that interface. +func getOrAssignIPv6Addr(ctx context.Context, d *dockerutil.Container, iface string) (net.IP, error) { + devs, err := devicesInfo(ctx, d) + if err != nil { + return net.IP{}, err + } + info := devs[iface] + if info.IPv6Addr != nil { + return info.IPv6Addr, nil + } + if info.MAC == nil { + return nil, fmt.Errorf("unable to find MAC address of %s", iface) + } + if logs, err := d.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "add", netdevs.MACToIP(info.MAC).String(), "scope", "link", "dev", iface); err != nil { + return net.IP{}, fmt.Errorf("unable to ip addr add on container %s: %w, logs: %s", d.Name, err, logs) + } + // Now try again, to make sure that it worked. + devs, err = devicesInfo(ctx, d) + if err != nil { + return net.IP{}, err + } + info = devs[iface] + if info.IPv6Addr == nil { + return net.IP{}, fmt.Errorf("unable to set IPv6 address on container %s", d.Name) + } + return info.IPv6Addr, nil +} + // createDockerNetwork makes a randomly-named network that will start with the // namePrefix. The network will be a random /24 subnet. func createDockerNetwork(ctx context.Context, n *dockerutil.Network) error { @@ -440,3 +564,30 @@ func StartContainer(ctx context.Context, runOpts dockerutil.RunOpts, c *dockerut } return nil } + +// MountTempDirectory creates a temporary directory on host with the template +// and then mounts it into the container under the name provided. The temporary +// directory name is returned. Content in that directory will be copied to +// TEST_UNDECLARED_OUTPUTS_DIR in cleanup phase. +func MountTempDirectory(t *testing.T, runOpts *dockerutil.RunOpts, hostDirTemplate, containerDir string) (string, error) { + t.Helper() + tmpDir, err := ioutil.TempDir("", hostDirTemplate) + if err != nil { + return "", fmt.Errorf("failed to create a temp dir: %w", err) + } + t.Cleanup(func() { + if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil { + t.Errorf("unable to copy container output files: %s", err) + } + if err := os.RemoveAll(tmpDir); err != nil { + t.Errorf("failed to remove tmpDir %s: %s", tmpDir, err) + } + }) + runOpts.Mounts = append(runOpts.Mounts, mount.Mount{ + Type: mount.TypeBind, + Source: tmpDir, + Target: containerDir, + ReadOnly: false, + }) + return tmpDir, nil +} diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go index c598bfc29..46334b7ab 100644 --- a/test/packetimpact/runner/packetimpact_test.go +++ b/test/packetimpact/runner/packetimpact_test.go @@ -28,5 +28,5 @@ func init() { } func TestOne(t *testing.T) { - runner.TestWithDUT(context.Background(), t, runner.NewDockerDUT, runner.DutAddr) + runner.TestWithDUT(context.Background(), t, runner.NewDockerDUT) } diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD index 5a0ee1367..983c2c030 100644 --- a/test/packetimpact/testbench/BUILD +++ b/test/packetimpact/testbench/BUILD @@ -21,7 +21,6 @@ go_library( "//pkg/tcpip/header", "//pkg/tcpip/seqnum", "//pkg/usermem", - "//test/packetimpact/netdevs", "//test/packetimpact/proto:posix_server_go_proto", "@com_github_google_go_cmp//cmp:go_default_library", "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go index 919b4fd25..576577310 100644 --- a/test/packetimpact/testbench/connections.go +++ b/test/packetimpact/testbench/connections.go @@ -17,7 +17,6 @@ package testbench import ( "fmt" "math/rand" - "net" "testing" "time" @@ -42,7 +41,7 @@ func portFromSockaddr(sa unix.Sockaddr) (uint16, error) { // pickPort makes a new socket and returns the socket FD and port. The domain // should be AF_INET or AF_INET6. The caller must close the FD when done with // the port if there is no error. -func pickPort(domain, typ int) (fd int, port uint16, err error) { +func (n *DUTTestNet) pickPort(domain, typ int) (fd int, port uint16, err error) { fd, err = unix.Socket(domain, typ, 0) if err != nil { return -1, 0, fmt.Errorf("creating socket: %w", err) @@ -58,11 +57,11 @@ func pickPort(domain, typ int) (fd int, port uint16, err error) { switch domain { case unix.AF_INET: var sa4 unix.SockaddrInet4 - copy(sa4.Addr[:], net.ParseIP(LocalIPv4).To4()) + copy(sa4.Addr[:], n.LocalIPv4) sa = &sa4 case unix.AF_INET6: - sa6 := unix.SockaddrInet6{ZoneId: uint32(LocalInterfaceID)} - copy(sa6.Addr[:], net.ParseIP(LocalIPv6).To16()) + sa6 := unix.SockaddrInet6{ZoneId: n.LocalDevID} + copy(sa6.Addr[:], n.LocalIPv6) sa = &sa6 default: return -1, 0, fmt.Errorf("invalid domain %d, it should be one of unix.AF_INET or unix.AF_INET6", domain) @@ -117,19 +116,12 @@ type etherState struct { var _ layerState = (*etherState)(nil) // newEtherState creates a new etherState. -func newEtherState(out, in Ether) (*etherState, error) { - lMAC, err := tcpip.ParseMACAddress(LocalMAC) - if err != nil { - return nil, fmt.Errorf("parsing local MAC: %q: %w", LocalMAC, err) - } - - rMAC, err := tcpip.ParseMACAddress(RemoteMAC) - if err != nil { - return nil, fmt.Errorf("parsing remote MAC: %q: %w", RemoteMAC, err) - } +func (n *DUTTestNet) newEtherState(out, in Ether) (*etherState, error) { + lmac := tcpip.LinkAddress(n.LocalMAC) + rmac := tcpip.LinkAddress(n.RemoteMAC) s := etherState{ - out: Ether{SrcAddr: &lMAC, DstAddr: &rMAC}, - in: Ether{SrcAddr: &rMAC, DstAddr: &lMAC}, + out: Ether{SrcAddr: &lmac, DstAddr: &rmac}, + in: Ether{SrcAddr: &rmac, DstAddr: &lmac}, } if err := s.out.merge(&out); err != nil { return nil, err @@ -169,9 +161,9 @@ type ipv4State struct { var _ layerState = (*ipv4State)(nil) // newIPv4State creates a new ipv4State. -func newIPv4State(out, in IPv4) (*ipv4State, error) { - lIP := tcpip.Address(net.ParseIP(LocalIPv4).To4()) - rIP := tcpip.Address(net.ParseIP(RemoteIPv4).To4()) +func (n *DUTTestNet) newIPv4State(out, in IPv4) (*ipv4State, error) { + lIP := tcpip.Address(n.LocalIPv4) + rIP := tcpip.Address(n.RemoteIPv4) s := ipv4State{ out: IPv4{SrcAddr: &lIP, DstAddr: &rIP}, in: IPv4{SrcAddr: &rIP, DstAddr: &lIP}, @@ -214,9 +206,9 @@ type ipv6State struct { var _ layerState = (*ipv6State)(nil) // newIPv6State creates a new ipv6State. -func newIPv6State(out, in IPv6) (*ipv6State, error) { - lIP := tcpip.Address(net.ParseIP(LocalIPv6).To16()) - rIP := tcpip.Address(net.ParseIP(RemoteIPv6).To16()) +func (n *DUTTestNet) newIPv6State(out, in IPv6) (*ipv6State, error) { + lIP := tcpip.Address(n.LocalIPv6) + rIP := tcpip.Address(n.RemoteIPv6) s := ipv6State{ out: IPv6{SrcAddr: &lIP, DstAddr: &rIP}, in: IPv6{SrcAddr: &rIP, DstAddr: &lIP}, @@ -272,8 +264,8 @@ func SeqNumValue(v seqnum.Value) *seqnum.Value { } // newTCPState creates a new TCPState. -func newTCPState(domain int, out, in TCP) (*tcpState, error) { - portPickerFD, localPort, err := pickPort(domain, unix.SOCK_STREAM) +func (n *DUTTestNet) newTCPState(domain int, out, in TCP) (*tcpState, error) { + portPickerFD, localPort, err := n.pickPort(domain, unix.SOCK_STREAM) if err != nil { return nil, err } @@ -314,11 +306,11 @@ func (s *tcpState) incoming(received Layer) Layer { if s.remoteSeqNum != nil { newIn.SeqNum = Uint32(uint32(*s.remoteSeqNum)) } - if s.localSeqNum != nil && (*tcpReceived.Flags&header.TCPFlagAck) != 0 { + if seq, flags := s.localSeqNum, tcpReceived.Flags; seq != nil && flags != nil && *flags&header.TCPFlagAck != 0 { // The caller didn't specify an AckNum so we'll expect the calculated one, // but only if the ACK flag is set because the AckNum is not valid in a // header if ACK is not set. - newIn.AckNum = Uint32(uint32(*s.localSeqNum)) + newIn.AckNum = Uint32(uint32(*seq)) } return &newIn } @@ -376,8 +368,8 @@ type udpState struct { var _ layerState = (*udpState)(nil) // newUDPState creates a new udpState. -func newUDPState(domain int, out, in UDP) (*udpState, error) { - portPickerFD, localPort, err := pickPort(domain, unix.SOCK_DGRAM) +func (n *DUTTestNet) newUDPState(domain int, out, in UDP) (*udpState, error) { + portPickerFD, localPort, err := n.pickPort(domain, unix.SOCK_DGRAM) if err != nil { return nil, fmt.Errorf("picking port: %w", err) } @@ -606,14 +598,14 @@ func (conn *Connection) ExpectFrame(t *testing.T, layers Layers, timeout time.Du var errs error for { var gotLayers Layers - if timeout = time.Until(deadline); timeout > 0 { + if timeout := time.Until(deadline); timeout > 0 { gotLayers = conn.recvFrame(t, timeout) } if gotLayers == nil { if errs == nil { - return nil, fmt.Errorf("got no frames matching %v during %s", layers, timeout) + return nil, fmt.Errorf("got no frames matching %s during %s", layers, timeout) } - return nil, fmt.Errorf("got frames %w want %v during %s", errs, layers, timeout) + return nil, fmt.Errorf("got frames:\n%w want %s during %s", errs, layers, timeout) } if conn.match(layers, gotLayers) { for i, s := range conn.layerStates { @@ -623,7 +615,12 @@ func (conn *Connection) ExpectFrame(t *testing.T, layers Layers, timeout time.Du } return gotLayers, nil } - errs = multierr.Combine(errs, &layersError{got: gotLayers, want: conn.incoming(gotLayers)}) + want := conn.incoming(layers) + if err := want.merge(layers); err != nil { + errs = multierr.Combine(errs, err) + } else { + errs = multierr.Combine(errs, &layersError{got: gotLayers, want: want}) + } } } @@ -639,26 +636,26 @@ func (conn *Connection) Drain(t *testing.T) { type TCPIPv4 Connection // NewTCPIPv4 creates a new TCPIPv4 connection with reasonable defaults. -func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 { +func (n *DUTTestNet) NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 { t.Helper() - etherState, err := newEtherState(Ether{}, Ether{}) + etherState, err := n.newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make etherState: %s", err) } - ipv4State, err := newIPv4State(IPv4{}, IPv4{}) + ipv4State, err := n.newIPv4State(IPv4{}, IPv4{}) if err != nil { t.Fatalf("can't make ipv4State: %s", err) } - tcpState, err := newTCPState(unix.AF_INET, outgoingTCP, incomingTCP) + tcpState, err := n.newTCPState(unix.AF_INET, outgoingTCP, incomingTCP) if err != nil { t.Fatalf("can't make tcpState: %s", err) } - injector, err := NewInjector(t) + injector, err := n.NewInjector(t) if err != nil { t.Fatalf("can't make injector: %s", err) } - sniffer, err := NewSniffer(t) + sniffer, err := n.NewSniffer(t) if err != nil { t.Fatalf("can't make sniffer: %s", err) } @@ -841,23 +838,23 @@ func (conn *TCPIPv4) Drain(t *testing.T) { type IPv4Conn Connection // NewIPv4Conn creates a new IPv4Conn connection with reasonable defaults. -func NewIPv4Conn(t *testing.T, outgoingIPv4, incomingIPv4 IPv4) IPv4Conn { +func (n *DUTTestNet) NewIPv4Conn(t *testing.T, outgoingIPv4, incomingIPv4 IPv4) IPv4Conn { t.Helper() - etherState, err := newEtherState(Ether{}, Ether{}) + etherState, err := n.newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make EtherState: %s", err) } - ipv4State, err := newIPv4State(outgoingIPv4, incomingIPv4) + ipv4State, err := n.newIPv4State(outgoingIPv4, incomingIPv4) if err != nil { t.Fatalf("can't make IPv4State: %s", err) } - injector, err := NewInjector(t) + injector, err := n.NewInjector(t) if err != nil { t.Fatalf("can't make injector: %s", err) } - sniffer, err := NewSniffer(t) + sniffer, err := n.NewSniffer(t) if err != nil { t.Fatalf("can't make sniffer: %s", err) } @@ -896,23 +893,23 @@ func (c *IPv4Conn) ExpectFrame(t *testing.T, frame Layers, timeout time.Duration type IPv6Conn Connection // NewIPv6Conn creates a new IPv6Conn connection with reasonable defaults. -func NewIPv6Conn(t *testing.T, outgoingIPv6, incomingIPv6 IPv6) IPv6Conn { +func (n *DUTTestNet) NewIPv6Conn(t *testing.T, outgoingIPv6, incomingIPv6 IPv6) IPv6Conn { t.Helper() - etherState, err := newEtherState(Ether{}, Ether{}) + etherState, err := n.newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make EtherState: %s", err) } - ipv6State, err := newIPv6State(outgoingIPv6, incomingIPv6) + ipv6State, err := n.newIPv6State(outgoingIPv6, incomingIPv6) if err != nil { t.Fatalf("can't make IPv6State: %s", err) } - injector, err := NewInjector(t) + injector, err := n.NewInjector(t) if err != nil { t.Fatalf("can't make injector: %s", err) } - sniffer, err := NewSniffer(t) + sniffer, err := n.NewSniffer(t) if err != nil { t.Fatalf("can't make sniffer: %s", err) } @@ -951,26 +948,26 @@ func (conn *IPv6Conn) ExpectFrame(t *testing.T, frame Layers, timeout time.Durat type UDPIPv4 Connection // NewUDPIPv4 creates a new UDPIPv4 connection with reasonable defaults. -func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 { +func (n *DUTTestNet) NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 { t.Helper() - etherState, err := newEtherState(Ether{}, Ether{}) + etherState, err := n.newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make etherState: %s", err) } - ipv4State, err := newIPv4State(IPv4{}, IPv4{}) + ipv4State, err := n.newIPv4State(IPv4{}, IPv4{}) if err != nil { t.Fatalf("can't make ipv4State: %s", err) } - udpState, err := newUDPState(unix.AF_INET, outgoingUDP, incomingUDP) + udpState, err := n.newUDPState(unix.AF_INET, outgoingUDP, incomingUDP) if err != nil { t.Fatalf("can't make udpState: %s", err) } - injector, err := NewInjector(t) + injector, err := n.NewInjector(t) if err != nil { t.Fatalf("can't make injector: %s", err) } - sniffer, err := NewSniffer(t) + sniffer, err := n.NewSniffer(t) if err != nil { t.Fatalf("can't make sniffer: %s", err) } @@ -1075,26 +1072,26 @@ func (conn *UDPIPv4) Drain(t *testing.T) { type UDPIPv6 Connection // NewUDPIPv6 creates a new UDPIPv6 connection with reasonable defaults. -func NewUDPIPv6(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv6 { +func (n *DUTTestNet) NewUDPIPv6(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv6 { t.Helper() - etherState, err := newEtherState(Ether{}, Ether{}) + etherState, err := n.newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make etherState: %s", err) } - ipv6State, err := newIPv6State(IPv6{}, IPv6{}) + ipv6State, err := n.newIPv6State(IPv6{}, IPv6{}) if err != nil { t.Fatalf("can't make IPv6State: %s", err) } - udpState, err := newUDPState(unix.AF_INET6, outgoingUDP, incomingUDP) + udpState, err := n.newUDPState(unix.AF_INET6, outgoingUDP, incomingUDP) if err != nil { t.Fatalf("can't make udpState: %s", err) } - injector, err := NewInjector(t) + injector, err := n.NewInjector(t) if err != nil { t.Fatalf("can't make injector: %s", err) } - sniffer, err := NewSniffer(t) + sniffer, err := n.NewSniffer(t) if err != nil { t.Fatalf("can't make sniffer: %s", err) } @@ -1126,14 +1123,14 @@ func (conn *UDPIPv6) ipv6State(t *testing.T) *ipv6State { } // LocalAddr gets the local socket address of this connection. -func (conn *UDPIPv6) LocalAddr(t *testing.T) *unix.SockaddrInet6 { +func (conn *UDPIPv6) LocalAddr(t *testing.T, zoneID uint32) *unix.SockaddrInet6 { t.Helper() sa := &unix.SockaddrInet6{ Port: int(*conn.udpState(t).out.SrcPort), // Local address is in perspective to the remote host, so it's scoped to the // ID of the remote interface. - ZoneId: uint32(RemoteInterfaceID), + ZoneId: zoneID, } copy(sa.Addr[:], *conn.ipv6State(t).out.SrcAddr) return sa @@ -1203,24 +1200,24 @@ func (conn *UDPIPv6) Drain(t *testing.T) { type TCPIPv6 Connection // NewTCPIPv6 creates a new TCPIPv6 connection with reasonable defaults. -func NewTCPIPv6(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv6 { - etherState, err := newEtherState(Ether{}, Ether{}) +func (n *DUTTestNet) NewTCPIPv6(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv6 { + etherState, err := n.newEtherState(Ether{}, Ether{}) if err != nil { t.Fatalf("can't make etherState: %s", err) } - ipv6State, err := newIPv6State(IPv6{}, IPv6{}) + ipv6State, err := n.newIPv6State(IPv6{}, IPv6{}) if err != nil { t.Fatalf("can't make ipv6State: %s", err) } - tcpState, err := newTCPState(unix.AF_INET6, outgoingTCP, incomingTCP) + tcpState, err := n.newTCPState(unix.AF_INET6, outgoingTCP, incomingTCP) if err != nil { t.Fatalf("can't make tcpState: %s", err) } - injector, err := NewInjector(t) + injector, err := n.NewInjector(t) if err != nil { t.Fatalf("can't make injector: %s", err) } - sniffer, err := NewSniffer(t) + sniffer, err := n.NewSniffer(t) if err != nil { t.Fatalf("can't make sniffer: %s", err) } diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go index 6165ab293..66a0255b8 100644 --- a/test/packetimpact/testbench/dut.go +++ b/test/packetimpact/testbench/dut.go @@ -17,9 +17,8 @@ package testbench import ( "context" "encoding/binary" - "flag" + "fmt" "net" - "strconv" "syscall" "testing" "time" @@ -35,18 +34,26 @@ import ( type DUT struct { conn *grpc.ClientConn posixServer POSIXClient + Net *DUTTestNet } // NewDUT creates a new connection with the DUT over gRPC. func NewDUT(t *testing.T) DUT { t.Helper() + n := GetDUTTestNet() + dut := n.ConnectToDUT(t) + t.Cleanup(func() { + dut.TearDownConnection() + dut.Net.Release() + }) + return dut +} - flag.Parse() - if err := genPseudoFlags(); err != nil { - t.Fatal("generating psuedo flags:", err) - } +// ConnectToDUT connects to DUT through gRPC. +func (n *DUTTestNet) ConnectToDUT(t *testing.T) DUT { + t.Helper() - posixServerAddress := POSIXServerIP + ":" + strconv.Itoa(POSIXServerPort) + posixServerAddress := net.JoinHostPort(n.POSIXServerIP.String(), fmt.Sprintf("%d", n.POSIXServerPort)) conn, err := grpc.Dial(posixServerAddress, grpc.WithInsecure(), grpc.WithKeepaliveParams(keepalive.ClientParameters{Timeout: RPCKeepalive})) if err != nil { t.Fatalf("failed to grpc.Dial(%s): %s", posixServerAddress, err) @@ -55,11 +62,12 @@ func NewDUT(t *testing.T) DUT { return DUT{ conn: conn, posixServer: posixServer, + Net: n, } } -// TearDown closes the underlying connection. -func (dut *DUT) TearDown() { +// TearDownConnection closes the underlying connection. +func (dut *DUT) TearDownConnection() { dut.conn.Close() } @@ -132,7 +140,7 @@ func (dut *DUT) CreateBoundSocket(t *testing.T, typ, proto int32, addr net.IP) ( fd = dut.Socket(t, unix.AF_INET6, typ, proto) sa := unix.SockaddrInet6{} copy(sa.Addr[:], addr.To16()) - sa.ZoneId = uint32(RemoteInterfaceID) + sa.ZoneId = dut.Net.RemoteDevID dut.Bind(t, fd, &sa) } else { t.Fatalf("invalid IP address: %s", addr) @@ -154,7 +162,7 @@ func (dut *DUT) CreateBoundSocket(t *testing.T, typ, proto int32, addr net.IP) ( func (dut *DUT) CreateListener(t *testing.T, typ, proto, backlog int32) (int32, uint16) { t.Helper() - fd, remotePort := dut.CreateBoundSocket(t, typ, proto, net.ParseIP(RemoteIPv4)) + fd, remotePort := dut.CreateBoundSocket(t, typ, proto, dut.Net.RemoteIPv4) dut.Listen(t, fd, backlog) return fd, remotePort } @@ -717,9 +725,9 @@ func (dut *DUT) SetSockLingerOption(t *testing.T, sockfd int32, timeout time.Dur dut.SetSockOpt(t, sockfd, unix.SOL_SOCKET, unix.SO_LINGER, buf) } -// Shutdown calls shutdown on the DUT and causes a fatal test failure if it doesn't -// succeed. If more control over the timeout or error handling is needed, use -// ShutdownWithErrno. +// Shutdown calls shutdown on the DUT and causes a fatal test failure if it +// doesn't succeed. If more control over the timeout or error handling is +// needed, use ShutdownWithErrno. func (dut *DUT) Shutdown(t *testing.T, fd, how int32) error { t.Helper() diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go index 7401a1991..19e6b8d7d 100644 --- a/test/packetimpact/testbench/layers.go +++ b/test/packetimpact/testbench/layers.go @@ -298,14 +298,12 @@ func (l *IPv4) ToBytes() ([]byte, error) { // An IPv4 header is variable length depending on the size of the Options. hdrLen := header.IPv4MinimumSize if l.Options != nil { - hdrLen += l.Options.SizeWithPadding() + if len(*l.Options)%4 != 0 { + return nil, fmt.Errorf("invalid header options '%x (len=%d)'; must be 32 bit aligned", *l.Options, len(*l.Options)) + } + hdrLen += len(*l.Options) if hdrLen > header.IPv4MaximumHeaderSize { - // While ToBytes can be called on packets that were received as well - // as packets locally generated, it is physically impossible for a - // received packet to overflow this value so any such failure must - // be the result of a local programming error and not remotely - // triggered. A panic is therefore appropriate. - panic(fmt.Sprintf("IPv4 Options %d bytes, Max %d", len(*l.Options), header.IPv4MaximumOptionsSize)) + return nil, fmt.Errorf("IPv4 Options %d bytes, Max %d", len(*l.Options), header.IPv4MaximumOptionsSize) } } b := make([]byte, hdrLen) @@ -323,10 +321,6 @@ func (l *IPv4) ToBytes() ([]byte, error) { DstAddr: tcpip.Address(""), Options: nil, } - // Leave an empty options slice as nil. - if hdrLen > header.IPv4MinimumSize { - fields.Options = *l.Options - } if l.TOS != nil { fields.TOS = *l.TOS } @@ -373,18 +367,31 @@ func (l *IPv4) ToBytes() ([]byte, error) { if l.DstAddr != nil { fields.DstAddr = *l.DstAddr } - if l.Checksum != nil { - fields.Checksum = *l.Checksum - } + h.Encode(fields) - if l.Checksum == nil { - h.SetChecksum(^h.CalculateChecksum()) + + // Put raw option bytes from test definition in header. Options as raw bytes + // allows us to serialize malformed options, which is not possible with + // the provided serialization functions. + if l.Options != nil { + h.SetHeaderLength(h.HeaderLength() + uint8(len(*l.Options))) + if got, want := copy(h.Options(), *l.Options), len(*l.Options); got != want { + return nil, fmt.Errorf("failed to copy option bytes into header, got %d want %d", got, want) + } } + // Encode cannot set this incorrectly so we need to overwrite what it wrote // in order to test handling of a bad IHL value. if l.IHL != nil { h.SetHeaderLength(*l.IHL) } + + if l.Checksum == nil { + h.SetChecksum(^h.CalculateChecksum()) + } else { + h.SetChecksum(*l.Checksum) + } + return h, nil } @@ -498,13 +505,13 @@ func (l *IPv6) ToBytes() ([]byte, error) { } } if l.NextHeader != nil { - fields.NextHeader = *l.NextHeader + fields.TransportProtocol = tcpip.TransportProtocolNumber(*l.NextHeader) } else { nh, err := nextHeaderByLayer(l.next()) if err != nil { return nil, err } - fields.NextHeader = nh + fields.TransportProtocol = tcpip.TransportProtocolNumber(nh) } if l.HopLimit != nil { fields.HopLimit = *l.HopLimit @@ -830,7 +837,9 @@ func (l *ICMPv6) ToBytes() ([]byte, error) { if l.Code != nil { h.SetCode(*l.Code) } - copy(h.NDPPayload(), l.Payload) + if n := copy(h.MessageBody(), l.Payload); n != len(l.Payload) { + panic(fmt.Sprintf("copied %d bytes, expected to copy %d bytes", n, len(l.Payload))) + } if l.Checksum != nil { h.SetChecksum(*l.Checksum) } else { @@ -876,7 +885,7 @@ func parseICMPv6(b []byte) (Layer, layerParser) { Type: ICMPv6Type(h.Type()), Code: ICMPv6Code(h.Code()), Checksum: Uint16(h.Checksum()), - Payload: h.NDPPayload(), + Payload: h.MessageBody(), } return &icmpv6, nil } diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go index 193bb2dc8..1ac96626a 100644 --- a/test/packetimpact/testbench/rawsockets.go +++ b/test/packetimpact/testbench/rawsockets.go @@ -38,13 +38,27 @@ func htons(x uint16) uint16 { } // NewSniffer creates a Sniffer connected to *device. -func NewSniffer(t *testing.T) (Sniffer, error) { +func (n *DUTTestNet) NewSniffer(t *testing.T) (Sniffer, error) { t.Helper() + ifInfo, err := net.InterfaceByName(n.LocalDevName) + if err != nil { + return Sniffer{}, err + } + + var haddr [8]byte + copy(haddr[:], ifInfo.HardwareAddr) + sa := unix.SockaddrLinklayer{ + Protocol: htons(unix.ETH_P_ALL), + Ifindex: ifInfo.Index, + } snifferFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL))) if err != nil { return Sniffer{}, err } + if err := unix.Bind(snifferFd, &sa); err != nil { + return Sniffer{}, err + } if err := unix.SetsockoptInt(snifferFd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, 1); err != nil { t.Fatalf("can't set sockopt SO_RCVBUFFORCE to 1: %s", err) } @@ -60,7 +74,8 @@ func NewSniffer(t *testing.T) (Sniffer, error) { // packet too large for the buffer arrives, the test will get a fatal error. const maxReadSize int = 65536 -// Recv tries to read one frame until the timeout is up. +// Recv tries to read one frame until the timeout is up. If the timeout given +// is 0, then no read attempt will be made. func (s *Sniffer) Recv(t *testing.T, timeout time.Duration) []byte { t.Helper() @@ -73,9 +88,13 @@ func (s *Sniffer) Recv(t *testing.T, timeout time.Duration) []byte { whole, frac := math.Modf(timeout.Seconds()) tv := unix.Timeval{ Sec: int64(whole), - Usec: int64(frac * float64(time.Microsecond/time.Second)), + Usec: int64(frac * float64(time.Second/time.Microsecond)), + } + // The following should never happen, but having this guard here is better + // than blocking indefinitely in the future. + if tv.Sec == 0 && tv.Usec == 0 { + t.Fatal("setting SO_RCVTIMEO to 0 means blocking indefinitely") } - if err := unix.SetsockoptTimeval(s.fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv); err != nil { t.Fatalf("can't setsockopt SO_RCVTIMEO: %s", err) } @@ -136,10 +155,10 @@ type Injector struct { } // NewInjector creates a new injector on *device. -func NewInjector(t *testing.T) (Injector, error) { +func (n *DUTTestNet) NewInjector(t *testing.T) (Injector, error) { t.Helper() - ifInfo, err := net.InterfaceByName(LocalDevice) + ifInfo, err := net.InterfaceByName(n.LocalDevName) if err != nil { return Injector{}, err } @@ -147,7 +166,7 @@ func NewInjector(t *testing.T) (Injector, error) { var haddr [8]byte copy(haddr[:], ifInfo.HardwareAddr) sa := unix.SockaddrLinklayer{ - Protocol: unix.ETH_P_IP, + Protocol: htons(unix.ETH_P_IP), Ifindex: ifInfo.Index, Halen: uint8(len(ifInfo.HardwareAddr)), Addr: haddr, diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go index c1db95d8c..891897d55 100644 --- a/test/packetimpact/testbench/testbench.go +++ b/test/packetimpact/testbench/testbench.go @@ -17,108 +17,105 @@ package testbench import ( + "encoding/json" "flag" "fmt" "math/rand" "net" - "os/exec" "testing" "time" - - "gvisor.dev/gvisor/test/packetimpact/netdevs" ) var ( // Native indicates that the test is being run natively. Native = false - // LocalDevice is the device that testbench uses to inject traffic. - LocalDevice = "" - // RemoteDevice is the device name on the DUT, individual tests can - // use the name to construct tests. - RemoteDevice = "" + // RPCKeepalive is the gRPC keepalive. + RPCKeepalive = 10 * time.Second + // RPCTimeout is the gRPC timeout. + RPCTimeout = 100 * time.Millisecond + + // dutTestNetsJSON is the json string that describes all the test networks to + // duts available to use. + dutTestNetsJSON string + // dutTestNets is the pool among which the testbench can choose a DUT to work + // with. + dutTestNets chan *DUTTestNet +) +// DUTTestNet describes the test network setup on dut and how the testbench +// should connect with an existing DUT. +type DUTTestNet struct { + // LocalMAC is the local MAC address on the test network. + LocalMAC net.HardwareAddr + // RemoteMAC is the DUT's MAC address on the test network. + RemoteMAC net.HardwareAddr // LocalIPv4 is the local IPv4 address on the test network. - LocalIPv4 = "" + LocalIPv4 net.IP // RemoteIPv4 is the DUT's IPv4 address on the test network. - RemoteIPv4 = "" + RemoteIPv4 net.IP // IPv4PrefixLength is the network prefix length of the IPv4 test network. - IPv4PrefixLength = 0 - + IPv4PrefixLength int // LocalIPv6 is the local IPv6 address on the test network. - LocalIPv6 = "" + LocalIPv6 net.IP // RemoteIPv6 is the DUT's IPv6 address on the test network. - RemoteIPv6 = "" + RemoteIPv6 net.IP + // LocalDevID is the ID of the local interface on the test network. + LocalDevID uint32 + // RemoteDevID is the ID of the remote interface on the test network. + RemoteDevID uint32 + // LocalDevName is the device that testbench uses to inject traffic. + LocalDevName string + // RemoteDevName is the device name on the DUT, individual tests can + // use the name to construct tests. + RemoteDevName string - // LocalInterfaceID is the ID of the local interface on the test network. - LocalInterfaceID uint32 - // RemoteInterfaceID is the ID of the remote interface on the test network. - // - // Not using uint32 because package flag does not support uint32. - RemoteInterfaceID uint64 - - // LocalMAC is the local MAC address on the test network. - LocalMAC = "" - // RemoteMAC is the DUT's MAC address on the test network. - RemoteMAC = "" + // The following two fields on actually on the control network instead + // of the test network, including them for convenience. // POSIXServerIP is the POSIX server's IP address on the control network. - POSIXServerIP = "" + POSIXServerIP net.IP // POSIXServerPort is the UDP port the POSIX server is bound to on the // control network. - POSIXServerPort = 40000 - - // RPCKeepalive is the gRPC keepalive. - RPCKeepalive = 10 * time.Second - // RPCTimeout is the gRPC timeout. - RPCTimeout = 100 * time.Millisecond -) + POSIXServerPort uint16 +} -// RegisterFlags defines flags and associates them with the package-level +// registerFlags defines flags and associates them with the package-level // exported variables above. It should be called by tests in their init // functions. -func RegisterFlags(fs *flag.FlagSet) { - fs.StringVar(&POSIXServerIP, "posix_server_ip", POSIXServerIP, "ip address to listen to for UDP commands") - fs.IntVar(&POSIXServerPort, "posix_server_port", POSIXServerPort, "port to listen to for UDP commands") +func registerFlags(fs *flag.FlagSet) { + fs.BoolVar(&Native, "native", Native, "whether the test is running natively") fs.DurationVar(&RPCTimeout, "rpc_timeout", RPCTimeout, "gRPC timeout") fs.DurationVar(&RPCKeepalive, "rpc_keepalive", RPCKeepalive, "gRPC keepalive") - fs.StringVar(&LocalIPv4, "local_ipv4", LocalIPv4, "local IPv4 address for test packets") - fs.StringVar(&RemoteIPv4, "remote_ipv4", RemoteIPv4, "remote IPv4 address for test packets") - fs.StringVar(&RemoteIPv6, "remote_ipv6", RemoteIPv6, "remote IPv6 address for test packets") - fs.StringVar(&RemoteMAC, "remote_mac", RemoteMAC, "remote mac address for test packets") - fs.StringVar(&LocalDevice, "local_device", LocalDevice, "local device to inject traffic") - fs.StringVar(&RemoteDevice, "remote_device", RemoteDevice, "remote device on the DUT") - fs.BoolVar(&Native, "native", Native, "whether the test is running natively") - fs.Uint64Var(&RemoteInterfaceID, "remote_interface_id", RemoteInterfaceID, "remote interface ID for test packets") + fs.StringVar(&dutTestNetsJSON, "dut_test_nets_json", dutTestNetsJSON, "path to the dut test nets json file") } -// genPseudoFlags populates flag-like global config based on real flags. -// -// genPseudoFlags must only be called after flag.Parse. -func genPseudoFlags() error { - out, err := exec.Command("ip", "addr", "show").CombinedOutput() - if err != nil { - return fmt.Errorf("listing devices: %q: %w", string(out), err) - } - devs, err := netdevs.ParseDevices(string(out)) - if err != nil { - return fmt.Errorf("parsing devices: %w", err) +// Initialize initializes the testbench, it parse the flags and sets up the +// pool of test networks for testbench's later use. +func Initialize(fs *flag.FlagSet) { + registerFlags(fs) + flag.Parse() + if err := loadDUTTestNets(); err != nil { + panic(err) } +} - _, deviceInfo, err := netdevs.FindDeviceByIP(net.ParseIP(LocalIPv4), devs) - if err != nil { - return fmt.Errorf("can't find deviceInfo: %w", err) +// loadDUTTestNets loads available DUT test networks from the json file, it +// must be called after flag.Parse(). +func loadDUTTestNets() error { + var parsedTestNets []DUTTestNet + if err := json.Unmarshal([]byte(dutTestNetsJSON), &parsedTestNets); err != nil { + return fmt.Errorf("failed to unmarshal JSON: %w", err) } - - LocalMAC = deviceInfo.MAC.String() - LocalIPv6 = deviceInfo.IPv6Addr.String() - LocalInterfaceID = deviceInfo.ID - - if deviceInfo.IPv4Net != nil { - IPv4PrefixLength, _ = deviceInfo.IPv4Net.Mask.Size() - } else { - IPv4PrefixLength, _ = net.ParseIP(LocalIPv4).DefaultMask().Size() + if got, want := len(parsedTestNets), 1; got < want { + return fmt.Errorf("got %d DUTs, the test requires at least %d DUTs", got, want) + } + // Using a buffered channel as semaphore + dutTestNets = make(chan *DUTTestNet, len(parsedTestNets)) + for i := range parsedTestNets { + parsedTestNets[i].LocalIPv4 = parsedTestNets[i].LocalIPv4.To4() + parsedTestNets[i].RemoteIPv4 = parsedTestNets[i].RemoteIPv4.To4() + dutTestNets <- &parsedTestNets[i] } - return nil } @@ -132,3 +129,15 @@ func GenerateRandomPayload(t *testing.T, n int) []byte { } return buf } + +// GetDUTTestNet gets a usable DUTTestNet, the function will block until any +// becomes available. +func GetDUTTestNet() *DUTTestNet { + return <-dutTestNets +} + +// Release releases the DUTTestNet back to the pool so that some other test +// can use. +func (n *DUTTestNet) Release() { + dutTestNets <- n +} diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index 33bd070c1..b1b3c578b 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -366,9 +366,29 @@ packetimpact_testbench( ], ) +packetimpact_testbench( + name = "tcp_zero_receive_window", + srcs = ["tcp_zero_receive_window_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + validate_all_tests() [packetimpact_go_test( name = t.name, expect_netstack_failure = hasattr(t, "expect_netstack_failure"), + num_duts = t.num_duts if hasattr(t, "num_duts") else 1, ) for t in ALL_TESTS] + +test_suite( + name = "all_tests", + tags = [ + "manual", + "packetimpact", + ], + tests = existing_rules(), +) diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go index a61054c2c..11f0fcd1e 100644 --- a/test/packetimpact/tests/fin_wait2_timeout_test.go +++ b/test/packetimpact/tests/fin_wait2_timeout_test.go @@ -25,7 +25,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestFinWait2Timeout(t *testing.T) { @@ -38,10 +38,9 @@ func TestFinWait2Timeout(t *testing.T) { } { t.Run(tt.description, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/icmpv6_param_problem_test.go b/test/packetimpact/tests/icmpv6_param_problem_test.go index 2d59d552d..40d7a491d 100644 --- a/test/packetimpact/tests/icmpv6_param_problem_test.go +++ b/test/packetimpact/tests/icmpv6_param_problem_test.go @@ -25,15 +25,14 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestICMPv6ParamProblemTest sends a packet with a bad next header. The DUT // should respond with an ICMPv6 Parameter Problem message. func TestICMPv6ParamProblemTest(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) + conn := dut.Net.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) defer conn.Close(t) ipv6 := testbench.IPv6{ // 254 is reserved and used for experimentation and testing. This should diff --git a/test/packetimpact/tests/ipv4_fragment_reassembly_test.go b/test/packetimpact/tests/ipv4_fragment_reassembly_test.go index 40f899065..d2203082d 100644 --- a/test/packetimpact/tests/ipv4_fragment_reassembly_test.go +++ b/test/packetimpact/tests/ipv4_fragment_reassembly_test.go @@ -27,17 +27,17 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } type fragmentInfo struct { offset uint16 size uint16 more uint8 + id uint16 } func TestIPv4FragmentReassembly(t *testing.T) { - const fragmentID = 42 icmpv4ProtoNum := uint8(header.ICMPv4ProtocolNumber) tests := []struct { @@ -45,32 +45,78 @@ func TestIPv4FragmentReassembly(t *testing.T) { ipPayloadLen int fragments []fragmentInfo expectReply bool + skip bool + skipReason string }{ { description: "basic reassembly", - ipPayloadLen: 2000, + ipPayloadLen: 3000, fragments: []fragmentInfo{ - {offset: 0, size: 1000, more: header.IPv4FlagMoreFragments}, - {offset: 1000, size: 1000, more: 0}, + {offset: 0, size: 1000, id: 5, more: header.IPv4FlagMoreFragments}, + {offset: 1000, size: 1000, id: 5, more: header.IPv4FlagMoreFragments}, + {offset: 2000, size: 1000, id: 5, more: 0}, }, expectReply: true, }, { description: "out of order fragments", - ipPayloadLen: 2000, + ipPayloadLen: 3000, fragments: []fragmentInfo{ - {offset: 1000, size: 1000, more: 0}, - {offset: 0, size: 1000, more: header.IPv4FlagMoreFragments}, + {offset: 2000, size: 1000, id: 6, more: 0}, + {offset: 0, size: 1000, id: 6, more: header.IPv4FlagMoreFragments}, + {offset: 1000, size: 1000, id: 6, more: header.IPv4FlagMoreFragments}, }, expectReply: true, }, + { + description: "duplicated fragments", + ipPayloadLen: 3000, + fragments: []fragmentInfo{ + {offset: 0, size: 1000, id: 7, more: header.IPv4FlagMoreFragments}, + {offset: 1000, size: 1000, id: 7, more: header.IPv4FlagMoreFragments}, + {offset: 1000, size: 1000, id: 7, more: header.IPv4FlagMoreFragments}, + {offset: 2000, size: 1000, id: 7, more: 0}, + }, + expectReply: true, + skip: true, + skipReason: "gvisor.dev/issues/4971", + }, + { + description: "fragment subset", + ipPayloadLen: 3000, + fragments: []fragmentInfo{ + {offset: 0, size: 1000, id: 8, more: header.IPv4FlagMoreFragments}, + {offset: 1000, size: 1000, id: 8, more: header.IPv4FlagMoreFragments}, + {offset: 512, size: 256, id: 8, more: header.IPv4FlagMoreFragments}, + {offset: 2000, size: 1000, id: 8, more: 0}, + }, + expectReply: true, + skip: true, + skipReason: "gvisor.dev/issues/4971", + }, + { + description: "fragment overlap", + ipPayloadLen: 3000, + fragments: []fragmentInfo{ + {offset: 0, size: 1000, id: 9, more: header.IPv4FlagMoreFragments}, + {offset: 1512, size: 1000, id: 9, more: header.IPv4FlagMoreFragments}, + {offset: 1000, size: 1000, id: 9, more: header.IPv4FlagMoreFragments}, + {offset: 2000, size: 1000, id: 9, more: 0}, + }, + expectReply: false, + skip: true, + skipReason: "gvisor.dev/issues/4971", + }, } for _, test := range tests { + if test.skip { + t.Skip("%s test skipped: %s", test.description, test.skipReason) + continue + } t.Run(test.description, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - conn := testbench.NewIPv4Conn(t, testbench.IPv4{}, testbench.IPv4{}) + conn := dut.Net.NewIPv4Conn(t, testbench.IPv4{}, testbench.IPv4{}) defer conn.Close(t) data := make([]byte, test.ipPayloadLen) @@ -96,7 +142,7 @@ func TestIPv4FragmentReassembly(t *testing.T) { Protocol: &icmpv4ProtoNum, FragmentOffset: testbench.Uint16(fragment.offset), Flags: testbench.Uint8(fragment.more), - ID: testbench.Uint16(fragmentID), + ID: testbench.Uint16(fragment.id), }, &testbench.Payload{ Bytes: data[fragment.offset:][:fragment.size], @@ -115,7 +161,7 @@ func TestIPv4FragmentReassembly(t *testing.T) { }, time.Second) if err != nil { // Either an unexpected frame was received, or none at all. - if bytesReceived < test.ipPayloadLen { + if test.expectReply && bytesReceived < test.ipPayloadLen { t.Fatalf("received %d bytes out of %d, then conn.ExpectFrame(_, _, time.Second) failed with %s", bytesReceived, test.ipPayloadLen, err) } break diff --git a/test/packetimpact/tests/ipv4_id_uniqueness_test.go b/test/packetimpact/tests/ipv4_id_uniqueness_test.go index 7f7a768d3..a63b41366 100644 --- a/test/packetimpact/tests/ipv4_id_uniqueness_test.go +++ b/test/packetimpact/tests/ipv4_id_uniqueness_test.go @@ -28,7 +28,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func recvTCPSegment(t *testing.T, conn *testbench.TCPIPv4, expect *testbench.TCP, expectPayload *testbench.Payload) (uint16, error) { @@ -67,12 +67,10 @@ func TestIPv4RetransmitIdentificationUniqueness(t *testing.T) { } { t.Run(tc.name, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/ipv6_fragment_icmp_error_test.go b/test/packetimpact/tests/ipv6_fragment_icmp_error_test.go index e058fb0d8..a37867e85 100644 --- a/test/packetimpact/tests/ipv6_fragment_icmp_error_test.go +++ b/test/packetimpact/tests/ipv6_fragment_icmp_error_test.go @@ -16,7 +16,6 @@ package ipv6_fragment_icmp_error_test import ( "flag" - "net" "testing" "time" @@ -35,10 +34,10 @@ const ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } -func fragmentedICMPEchoRequest(t *testing.T, conn *testbench.Connection, firstPayloadLength uint16, payload []byte, secondFragmentOffset uint16) ([]testbench.Layers, [][]byte) { +func fragmentedICMPEchoRequest(t *testing.T, n *testbench.DUTTestNet, conn *testbench.Connection, firstPayloadLength uint16, payload []byte, secondFragmentOffset uint16) ([]testbench.Layers, [][]byte) { t.Helper() icmpv6Header := header.ICMPv6(make([]byte, header.ICMPv6EchoMinimumSize)) @@ -48,8 +47,8 @@ func fragmentedICMPEchoRequest(t *testing.T, conn *testbench.Connection, firstPa icmpv6Header.SetSequence(0) cksum := header.ICMPv6Checksum( icmpv6Header, - tcpip.Address(net.ParseIP(testbench.LocalIPv6).To16()), - tcpip.Address(net.ParseIP(testbench.RemoteIPv6).To16()), + tcpip.Address(n.LocalIPv6), + tcpip.Address(n.RemoteIPv6), buffer.NewVectorisedView(len(payload), []buffer.View{payload}), ) icmpv6Header.SetChecksum(cksum) @@ -120,13 +119,13 @@ func TestIPv6ICMPEchoRequestFragmentReassembly(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { + t.Parallel() dut := testbench.NewDUT(t) - defer dut.TearDown() - ipv6Conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) + ipv6Conn := dut.Net.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) conn := (*testbench.Connection)(&ipv6Conn) defer ipv6Conn.Close(t) - fragments, _ := fragmentedICMPEchoRequest(t, conn, test.firstPayloadLength, test.payload, test.secondFragmentOffset) + fragments, _ := fragmentedICMPEchoRequest(t, dut.Net, conn, test.firstPayloadLength, test.payload, test.secondFragmentOffset) for _, i := range test.sendFrameOrder { conn.SendFrame(t, fragments[i-1]) @@ -222,13 +221,13 @@ func TestIPv6FragmentReassemblyTimeout(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { + t.Parallel() dut := testbench.NewDUT(t) - defer dut.TearDown() - ipv6Conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) + ipv6Conn := dut.Net.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) conn := (*testbench.Connection)(&ipv6Conn) defer ipv6Conn.Close(t) - fragments, ipv6Bytes := fragmentedICMPEchoRequest(t, conn, test.firstPayloadLength, test.payload, test.secondFragmentOffset) + fragments, ipv6Bytes := fragmentedICMPEchoRequest(t, dut.Net, conn, test.firstPayloadLength, test.payload, test.secondFragmentOffset) for _, i := range test.sendFrameOrder { conn.SendFrame(t, fragments[i-1]) @@ -318,13 +317,13 @@ func TestIPv6FragmentParamProblem(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { + t.Parallel() dut := testbench.NewDUT(t) - defer dut.TearDown() - ipv6Conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) + ipv6Conn := dut.Net.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) conn := (*testbench.Connection)(&ipv6Conn) defer ipv6Conn.Close(t) - fragments, ipv6Bytes := fragmentedICMPEchoRequest(t, conn, test.firstPayloadLength, test.payload, test.secondFragmentOffset) + fragments, ipv6Bytes := fragmentedICMPEchoRequest(t, dut.Net, conn, test.firstPayloadLength, test.payload, test.secondFragmentOffset) for _, i := range test.sendFrameOrder { conn.SendFrame(t, fragments[i-1]) diff --git a/test/packetimpact/tests/ipv6_fragment_reassembly_test.go b/test/packetimpact/tests/ipv6_fragment_reassembly_test.go index eb56a53f7..dd98ee7a1 100644 --- a/test/packetimpact/tests/ipv6_fragment_reassembly_test.go +++ b/test/packetimpact/tests/ipv6_fragment_reassembly_test.go @@ -17,7 +17,6 @@ package ipv6_fragment_reassembly_test import ( "flag" "math/rand" - "net" "testing" "time" @@ -29,17 +28,17 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } type fragmentInfo struct { offset uint16 size uint16 more bool + id uint32 } func TestIPv6FragmentReassembly(t *testing.T) { - const fragmentID = 42 icmpv6ProtoNum := header.IPv6ExtensionHeaderIdentifier(header.ICMPv6ProtocolNumber) tests := []struct { @@ -50,10 +49,11 @@ func TestIPv6FragmentReassembly(t *testing.T) { }{ { description: "basic reassembly", - ipPayloadLen: 1500, + ipPayloadLen: 3000, fragments: []fragmentInfo{ - {offset: 0, size: 760, more: true}, - {offset: 760, size: 740, more: false}, + {offset: 0, size: 1000, id: 100, more: true}, + {offset: 1000, size: 1000, id: 100, more: true}, + {offset: 2000, size: 1000, id: 100, more: false}, }, expectReply: true, }, @@ -61,23 +61,55 @@ func TestIPv6FragmentReassembly(t *testing.T) { description: "out of order fragments", ipPayloadLen: 3000, fragments: []fragmentInfo{ - {offset: 0, size: 1024, more: true}, - {offset: 2048, size: 952, more: false}, - {offset: 1024, size: 1024, more: true}, + {offset: 0, size: 1000, id: 101, more: true}, + {offset: 2000, size: 1000, id: 101, more: false}, + {offset: 1000, size: 1000, id: 101, more: true}, + }, + expectReply: true, + }, + { + description: "duplicated fragments", + ipPayloadLen: 3000, + fragments: []fragmentInfo{ + {offset: 0, size: 1000, id: 102, more: true}, + {offset: 1000, size: 1000, id: 102, more: true}, + {offset: 1000, size: 1000, id: 102, more: true}, + {offset: 2000, size: 1000, id: 102, more: false}, + }, + expectReply: true, + }, + { + description: "fragment subset", + ipPayloadLen: 3000, + fragments: []fragmentInfo{ + {offset: 0, size: 1000, id: 103, more: true}, + {offset: 1000, size: 1000, id: 103, more: true}, + {offset: 512, size: 256, id: 103, more: true}, + {offset: 2000, size: 1000, id: 103, more: false}, }, expectReply: true, }, + { + description: "fragment overlap", + ipPayloadLen: 3000, + fragments: []fragmentInfo{ + {offset: 0, size: 1000, id: 104, more: true}, + {offset: 1512, size: 1000, id: 104, more: true}, + {offset: 1000, size: 1000, id: 104, more: true}, + {offset: 2000, size: 1000, id: 104, more: false}, + }, + expectReply: false, + }, } for _, test := range tests { t.Run(test.description, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) + conn := dut.Net.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) defer conn.Close(t) - lIP := tcpip.Address(net.ParseIP(testbench.LocalIPv6).To16()) - rIP := tcpip.Address(net.ParseIP(testbench.RemoteIPv6).To16()) + lIP := tcpip.Address(dut.Net.LocalIPv6) + rIP := tcpip.Address(dut.Net.RemoteIPv6) data := make([]byte, test.ipPayloadLen) icmp := header.ICMPv6(data[:header.ICMPv6HeaderSize]) @@ -103,7 +135,7 @@ func TestIPv6FragmentReassembly(t *testing.T) { NextHeader: &icmpv6ProtoNum, FragmentOffset: testbench.Uint16(fragment.offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit), MoreFragments: testbench.Bool(fragment.more), - Identification: testbench.Uint32(fragmentID), + Identification: testbench.Uint32(fragment.id), }, &testbench.Payload{ Bytes: data[fragment.offset:][:fragment.size], @@ -120,7 +152,7 @@ func TestIPv6FragmentReassembly(t *testing.T) { }, time.Second) if err != nil { // Either an unexpected frame was received, or none at all. - if bytesReceived < test.ipPayloadLen { + if test.expectReply && bytesReceived < test.ipPayloadLen { t.Fatalf("received %d bytes out of %d, then conn.ExpectFrame(_, _, time.Second) failed with %s", bytesReceived, test.ipPayloadLen, err) } break diff --git a/test/packetimpact/tests/ipv6_unknown_options_action_test.go b/test/packetimpact/tests/ipv6_unknown_options_action_test.go index e79d74476..cb5396417 100644 --- a/test/packetimpact/tests/ipv6_unknown_options_action_test.go +++ b/test/packetimpact/tests/ipv6_unknown_options_action_test.go @@ -27,7 +27,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func mkHopByHopOptionsExtHdr(optType byte) testbench.Layer { @@ -141,8 +141,7 @@ func TestIPv6UnknownOptionAction(t *testing.T) { } { t.Run(tt.description, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - ipv6Conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) + ipv6Conn := dut.Net.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{}) conn := (*testbench.Connection)(&ipv6Conn) defer ipv6Conn.Close(t) diff --git a/test/packetimpact/tests/tcp_cork_mss_test.go b/test/packetimpact/tests/tcp_cork_mss_test.go index 8feea4a82..a7ba5035e 100644 --- a/test/packetimpact/tests/tcp_cork_mss_test.go +++ b/test/packetimpact/tests/tcp_cork_mss_test.go @@ -25,16 +25,15 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestTCPCorkMSS tests for segment coalesce and split as per MSS. func TestTCPCorkMSS(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) const mss = uint32(header.TCPDefaultMSS) diff --git a/test/packetimpact/tests/tcp_handshake_window_size_test.go b/test/packetimpact/tests/tcp_handshake_window_size_test.go index 22937d92f..5d1266f3c 100644 --- a/test/packetimpact/tests/tcp_handshake_window_size_test.go +++ b/test/packetimpact/tests/tcp_handshake_window_size_test.go @@ -25,17 +25,16 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestTCPHandshakeWindowSize tests if the stack is honoring the window size // communicated during handshake. func TestTCPHandshakeWindowSize(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) // Start handshake with zero window size. diff --git a/test/packetimpact/tests/tcp_linger_test.go b/test/packetimpact/tests/tcp_linger_test.go index b9a0409aa..bc4b64388 100644 --- a/test/packetimpact/tests/tcp_linger_test.go +++ b/test/packetimpact/tests/tcp_linger_test.go @@ -27,12 +27,12 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func createSocket(t *testing.T, dut testbench.DUT) (int32, int32, testbench.TCPIPv4) { listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) conn.Connect(t) acceptFD, _ := dut.Accept(t, listenFD) return acceptFD, listenFD, conn @@ -41,7 +41,6 @@ func createSocket(t *testing.T, dut testbench.DUT) (int32, int32, testbench.TCPI func closeAll(t *testing.T, dut testbench.DUT, listenFD int32, conn testbench.TCPIPv4) { conn.Close(t) dut.Close(t, listenFD) - dut.TearDown() } // lingerDuration is the timeout value used with SO_LINGER socket option. @@ -266,5 +265,4 @@ func TestTCPLingerNonEstablished(t *testing.T) { if diff > lingerDuration { t.Errorf("expected close to return within %s, but returned after %s", lingerDuration, diff) } - dut.TearDown() } diff --git a/test/packetimpact/tests/tcp_network_unreachable_test.go b/test/packetimpact/tests/tcp_network_unreachable_test.go index 8a1fe1279..6cd6d2edf 100644 --- a/test/packetimpact/tests/tcp_network_unreachable_test.go +++ b/test/packetimpact/tests/tcp_network_unreachable_test.go @@ -17,7 +17,6 @@ package tcp_synsent_reset_test import ( "context" "flag" - "net" "syscall" "testing" "time" @@ -28,7 +27,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestTCPSynSentUnreachable verifies that TCP connections fail immediately when @@ -37,17 +36,16 @@ func init() { func TestTCPSynSentUnreachable(t *testing.T) { // Create the DUT and connection. dut := testbench.NewDUT(t) - defer dut.TearDown() - clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) + clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, dut.Net.RemoteIPv4) port := uint16(9001) - conn := testbench.NewTCPIPv4(t, testbench.TCP{SrcPort: &port, DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort, DstPort: &port}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{SrcPort: &port, DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort, DstPort: &port}) defer conn.Close(t) // Bring the DUT to SYN-SENT state with a non-blocking connect. ctx, cancel := context.WithTimeout(context.Background(), testbench.RPCTimeout) defer cancel() sa := unix.SockaddrInet4{Port: int(port)} - copy(sa.Addr[:], net.IP(net.ParseIP(testbench.LocalIPv4)).To4()) + copy(sa.Addr[:], dut.Net.LocalIPv4) if _, err := dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != syscall.Errno(unix.EINPROGRESS) { t.Errorf("expected connect to fail with EINPROGRESS, but got %v", err) } @@ -91,9 +89,8 @@ func TestTCPSynSentUnreachable(t *testing.T) { func TestTCPSynSentUnreachable6(t *testing.T) { // Create the DUT and connection. dut := testbench.NewDUT(t) - defer dut.TearDown() - clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv6)) - conn := testbench.NewTCPIPv6(t, testbench.TCP{DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort}) + clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, dut.Net.RemoteIPv6) + conn := dut.Net.NewTCPIPv6(t, testbench.TCP{DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort}) defer conn.Close(t) // Bring the DUT to SYN-SENT state with a non-blocking connect. @@ -101,9 +98,9 @@ func TestTCPSynSentUnreachable6(t *testing.T) { defer cancel() sa := unix.SockaddrInet6{ Port: int(conn.SrcPort()), - ZoneId: uint32(testbench.RemoteInterfaceID), + ZoneId: dut.Net.RemoteDevID, } - copy(sa.Addr[:], net.IP(net.ParseIP(testbench.LocalIPv6)).To16()) + copy(sa.Addr[:], dut.Net.LocalIPv6) if _, err := dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != syscall.Errno(unix.EINPROGRESS) { t.Errorf("expected connect to fail with EINPROGRESS, but got %v", err) } diff --git a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go index 82b7a85ff..f0af5352d 100644 --- a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go +++ b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go @@ -25,14 +25,13 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestTcpNoAcceptCloseReset(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) conn.Connect(t) defer conn.Close(t) dut.Close(t, listenFd) diff --git a/test/packetimpact/tests/tcp_outside_the_window_test.go b/test/packetimpact/tests/tcp_outside_the_window_test.go index 08f759f7c..1b041932a 100644 --- a/test/packetimpact/tests/tcp_outside_the_window_test.go +++ b/test/packetimpact/tests/tcp_outside_the_window_test.go @@ -27,7 +27,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestTCPOutsideTheWindows tests the behavior of the DUT when packets arrive @@ -62,10 +62,9 @@ func TestTCPOutsideTheWindow(t *testing.T) { } { t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) acceptFD, _ := dut.Accept(t, listenFD) diff --git a/test/packetimpact/tests/tcp_paws_mechanism_test.go b/test/packetimpact/tests/tcp_paws_mechanism_test.go index 37f3b56dd..24d9ef4ec 100644 --- a/test/packetimpact/tests/tcp_paws_mechanism_test.go +++ b/test/packetimpact/tests/tcp_paws_mechanism_test.go @@ -26,15 +26,14 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestPAWSMechanism(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) options := make([]byte, header.TCPOptionTSLength) diff --git a/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go index d9f3ea0f2..646c93216 100644 --- a/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go +++ b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go @@ -20,7 +20,6 @@ import ( "encoding/hex" "errors" "flag" - "net" "sync" "syscall" "testing" @@ -32,7 +31,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestQueueReceiveInSynSent tests receive behavior when the TCP state @@ -50,10 +49,9 @@ func TestQueueReceiveInSynSent(t *testing.T) { } { t.Run(tt.description, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, dut.Net.RemoteIPv4) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) sampleData := []byte("Sample Data") diff --git a/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go index 0ec8fd748..29e51cae3 100644 --- a/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go +++ b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go @@ -18,7 +18,6 @@ import ( "context" "errors" "flag" - "net" "sync" "syscall" "testing" @@ -30,7 +29,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestQueueSendInSynSent tests send behavior when the TCP state @@ -48,10 +47,9 @@ func TestQueueSendInSynSent(t *testing.T) { } { t.Run(tt.description, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, dut.Net.RemoteIPv4) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) sampleData := []byte("Sample Data") diff --git a/test/packetimpact/tests/tcp_rcv_buf_space_test.go b/test/packetimpact/tests/tcp_rcv_buf_space_test.go index cfbba1e8e..d6ad5cda6 100644 --- a/test/packetimpact/tests/tcp_rcv_buf_space_test.go +++ b/test/packetimpact/tests/tcp_rcv_buf_space_test.go @@ -26,7 +26,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestReduceRecvBuf tests that a packet within window is still dropped @@ -34,10 +34,9 @@ func init() { // segment. func TestReduceRecvBuf(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_reordering_test.go b/test/packetimpact/tests/tcp_reordering_test.go index b4aeaab57..ca352dbc7 100644 --- a/test/packetimpact/tests/tcp_reordering_test.go +++ b/test/packetimpact/tests/tcp_reordering_test.go @@ -22,19 +22,18 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" - tb "gvisor.dev/gvisor/test/packetimpact/testbench" + "gvisor.dev/gvisor/test/packetimpact/testbench" ) func init() { - tb.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestReorderingWindow(t *testing.T) { - dut := tb.NewDUT(t) - defer dut.TearDown() + dut := testbench.NewDUT(t) listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) // Enable SACK. @@ -54,13 +53,13 @@ func TestReorderingWindow(t *testing.T) { acceptFd, _ := dut.Accept(t, listenFd) defer dut.Close(t, acceptFd) - if tb.Native { + if testbench.Native { // Linux has changed its handling of reordering, force the old behavior. dut.SetSockOpt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_CONGESTION, []byte("reno")) } pls := dut.GetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_MAXSEG) - if !tb.Native { + if !testbench.Native { // netstack does not impliment TCP_MAXSEG correctly. Fake it // here. Netstack uses the max SACK size which is 32. The MSS // option is 8 bytes, making the total 36 bytes. @@ -75,14 +74,14 @@ func TestReorderingWindow(t *testing.T) { for i, sn := 0, seqNum1; i < numPkts; i++ { dut.Send(t, acceptFd, payload, 0) - gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) + gotOne, err := conn.Expect(t, testbench.TCP{SeqNum: testbench.Uint32(uint32(sn))}, time.Second) sn.UpdateForward(seqnum.Size(len(payload))) if err != nil { - t.Errorf("Expect #%d: %s", i+1, err) + t.Fatalf("Expect #%d: %s", i+1, err) continue } if gotOne == nil { - t.Errorf("#%d: expected a packet within a second but got none", i+1) + t.Fatalf("#%d: expected a packet within a second but got none", i+1) } } @@ -97,13 +96,13 @@ func TestReorderingWindow(t *testing.T) { seqNum1.Add(seqnum.Size(len(payload))), seqNum1.Add(seqnum.Size(4 * len(payload))), }}, sackBlock[sbOff:]) - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum1)), Options: sackBlock[:sbOff]}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: testbench.Uint32(uint32(seqNum1)), Options: sackBlock[:sbOff]}) // ACK first packet. - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum1) + uint32(len(payload)))}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: testbench.Uint32(uint32(seqNum1) + uint32(len(payload)))}) // Check for retransmit. - gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(seqNum1))}, time.Second) + gotOne, err := conn.Expect(t, testbench.TCP{SeqNum: testbench.Uint32(uint32(seqNum1))}, time.Second) if err != nil { t.Error("Expect for retransmit:", err) } @@ -123,29 +122,29 @@ func TestReorderingWindow(t *testing.T) { seqNum1.Add(seqnum.Size(4 * len(payload))), }}, dsackBlock[dsbOff:]) - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: tb.Uint32(uint32(seqNum2)), Options: dsackBlock[:dsbOff]}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: testbench.Uint32(uint32(seqNum2)), Options: dsackBlock[:dsbOff]}) // Send half of the original window of packets, checking that we // received each. for i, sn := 0, seqNum2; i < numPkts/2; i++ { dut.Send(t, acceptFd, payload, 0) - gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) + gotOne, err := conn.Expect(t, testbench.TCP{SeqNum: testbench.Uint32(uint32(sn))}, time.Second) sn.UpdateForward(seqnum.Size(len(payload))) if err != nil { - t.Errorf("Expect #%d: %s", i+1, err) + t.Fatalf("Expect #%d: %s", i+1, err) continue } if gotOne == nil { - t.Errorf("#%d: expected a packet within a second but got none", i+1) + t.Fatalf("#%d: expected a packet within a second but got none", i+1) } } - if !tb.Native { + if !testbench.Native { // The window should now be halved, so we should receive any // more, even if we send them. dut.Send(t, acceptFd, payload, 0) - if got, err := conn.Expect(t, tb.TCP{}, 100*time.Millisecond); got != nil || err == nil { + if got, err := conn.Expect(t, testbench.TCP{}, 100*time.Millisecond); got != nil || err == nil { t.Fatalf("expected no packets within 100 millisecond, but got one: %s", got) } return @@ -155,20 +154,20 @@ func TestReorderingWindow(t *testing.T) { for i, sn := 0, seqNum2.Add(seqnum.Size(numPkts/2*len(payload))); i < 2; i++ { dut.Send(t, acceptFd, payload, 0) - gotOne, err := conn.Expect(t, tb.TCP{SeqNum: tb.Uint32(uint32(sn))}, time.Second) + gotOne, err := conn.Expect(t, testbench.TCP{SeqNum: testbench.Uint32(uint32(sn))}, time.Second) sn.UpdateForward(seqnum.Size(len(payload))) if err != nil { - t.Errorf("Expect #%d: %s", i+1, err) + t.Fatalf("Expect #%d: %s", i+1, err) continue } if gotOne == nil { - t.Errorf("#%d: expected a packet within a second but got none", i+1) + t.Fatalf("#%d: expected a packet within a second but got none", i+1) } } // The window should now be full. dut.Send(t, acceptFd, payload, 0) - if got, err := conn.Expect(t, tb.TCP{}, 100*time.Millisecond); got != nil || err == nil { + if got, err := conn.Expect(t, testbench.TCP{}, 100*time.Millisecond); got != nil || err == nil { t.Fatalf("expected no packets within 100 millisecond, but got one: %s", got) } } diff --git a/test/packetimpact/tests/tcp_retransmits_test.go b/test/packetimpact/tests/tcp_retransmits_test.go index 072014ff8..27e9641b1 100644 --- a/test/packetimpact/tests/tcp_retransmits_test.go +++ b/test/packetimpact/tests/tcp_retransmits_test.go @@ -25,17 +25,16 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestRetransmits tests retransmits occur at exponentially increasing // time intervals. func TestRetransmits(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go index f91b06ba1..418393796 100644 --- a/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go +++ b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go @@ -16,7 +16,6 @@ package tcp_send_window_sizes_piggyback_test import ( "flag" - "fmt" "testing" "time" @@ -26,7 +25,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestSendWindowSizesPiggyback tests cases where segment sizes are close to @@ -58,13 +57,12 @@ func TestSendWindowSizesPiggyback(t *testing.T) { // greater than available sender window. {"WindowGreaterThanSegment", segmentSize + 1, sampleData, sampleData, true /* enqueue */}, } { - t.Run(fmt.Sprintf("%s%d", tt.description, tt.windowSize), func(t *testing.T) { + t.Run(tt.description, func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort, WindowSize: testbench.Uint16(tt.windowSize)}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort, WindowSize: testbench.Uint16(tt.windowSize)}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_synrcvd_reset_test.go b/test/packetimpact/tests/tcp_synrcvd_reset_test.go index 57d034dd1..c5bbd29ee 100644 --- a/test/packetimpact/tests/tcp_synrcvd_reset_test.go +++ b/test/packetimpact/tests/tcp_synrcvd_reset_test.go @@ -25,16 +25,15 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestTCPSynRcvdReset tests transition from SYN-RCVD to CLOSED. func TestTCPSynRcvdReset(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) // Expect dut connection to have transitioned to SYN-RCVD state. diff --git a/test/packetimpact/tests/tcp_synsent_reset_test.go b/test/packetimpact/tests/tcp_synsent_reset_test.go index eac8eb19d..2c8bb101b 100644 --- a/test/packetimpact/tests/tcp_synsent_reset_test.go +++ b/test/packetimpact/tests/tcp_synsent_reset_test.go @@ -16,34 +16,33 @@ package tcp_synsent_reset_test import ( "flag" - "net" "testing" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/tcpip/header" - tb "gvisor.dev/gvisor/test/packetimpact/testbench" + "gvisor.dev/gvisor/test/packetimpact/testbench" ) func init() { - tb.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // dutSynSentState sets up the dut connection in SYN-SENT state. -func dutSynSentState(t *testing.T) (*tb.DUT, *tb.TCPIPv4, uint16, uint16) { +func dutSynSentState(t *testing.T) (*testbench.DUT, *testbench.TCPIPv4, uint16, uint16) { t.Helper() - dut := tb.NewDUT(t) + dut := testbench.NewDUT(t) - clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(tb.RemoteIPv4)) + clientFD, clientPort := dut.CreateBoundSocket(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, dut.Net.RemoteIPv4) port := uint16(9001) - conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &port, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &port}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{SrcPort: &port, DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort, DstPort: &port}) sa := unix.SockaddrInet4{Port: int(port)} - copy(sa.Addr[:], net.IP(net.ParseIP(tb.LocalIPv4)).To4()) + copy(sa.Addr[:], dut.Net.LocalIPv4) // Bring the dut to SYN-SENT state with a non-blocking connect. dut.Connect(t, clientFD, &sa) - if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}, nil, time.Second); err != nil { + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, nil, time.Second); err != nil { t.Fatalf("expected SYN\n") } @@ -52,14 +51,13 @@ func dutSynSentState(t *testing.T) (*tb.DUT, *tb.TCPIPv4, uint16, uint16) { // TestTCPSynSentReset tests RFC793, p67: SYN-SENT to CLOSED transition. func TestTCPSynSentReset(t *testing.T) { - dut, conn, _, _ := dutSynSentState(t) + _, conn, _, _ := dutSynSentState(t) defer conn.Close(t) - defer dut.TearDown() - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) // Expect the connection to have closed. // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side. - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}) - if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { t.Fatalf("expected a TCP RST") } } @@ -68,23 +66,22 @@ func TestTCPSynSentReset(t *testing.T) { // transitions. func TestTCPSynSentRcvdReset(t *testing.T) { dut, c, remotePort, clientPort := dutSynSentState(t) - defer dut.TearDown() defer c.Close(t) - conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &remotePort, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{SrcPort: &remotePort, DstPort: &clientPort}, testbench.TCP{SrcPort: &clientPort, DstPort: &remotePort}) defer conn.Close(t) // Initiate new SYN connection with the same port pair // (simultaneous open case), expect the dut connection to move to // SYN-RCVD state - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}) - if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { t.Fatalf("expected SYN-ACK %s\n", err) } - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}) + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}) // Expect the connection to have transitioned SYN-RCVD to CLOSED. // TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side. - conn.Send(t, tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}) - if _, err := conn.ExpectData(t, &tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil { t.Fatalf("expected a TCP RST") } } diff --git a/test/packetimpact/tests/tcp_timewait_reset_test.go b/test/packetimpact/tests/tcp_timewait_reset_test.go index 2f76a6531..d1d2fb83d 100644 --- a/test/packetimpact/tests/tcp_timewait_reset_test.go +++ b/test/packetimpact/tests/tcp_timewait_reset_test.go @@ -25,16 +25,15 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestTimeWaitReset tests handling of RST when in TIME_WAIT state. func TestTimeWaitReset(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_unacc_seq_ack_test.go b/test/packetimpact/tests/tcp_unacc_seq_ack_test.go index d078bbf15..ea962c818 100644 --- a/test/packetimpact/tests/tcp_unacc_seq_ack_test.go +++ b/test/packetimpact/tests/tcp_unacc_seq_ack_test.go @@ -28,7 +28,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestEstablishedUnaccSeqAck(t *testing.T) { @@ -48,10 +48,9 @@ func TestEstablishedUnaccSeqAck(t *testing.T) { } { t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) @@ -102,10 +101,9 @@ func TestPassiveCloseUnaccSeqAck(t *testing.T) { } { t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) @@ -164,10 +162,9 @@ func TestActiveCloseUnaccpSeqAck(t *testing.T) { } { t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_user_timeout_test.go b/test/packetimpact/tests/tcp_user_timeout_test.go index 551dc78e7..b16e65366 100644 --- a/test/packetimpact/tests/tcp_user_timeout_test.go +++ b/test/packetimpact/tests/tcp_user_timeout_test.go @@ -25,7 +25,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func sendPayload(t *testing.T, conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) { @@ -64,10 +64,9 @@ func TestTCPUserTimeout(t *testing.T) { t.Run(tt.description+ttf.description, func(t *testing.T) { // Create a socket, listen, TCP handshake, and accept. dut := testbench.NewDUT(t) - defer dut.TearDown() listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFD) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) acceptFD, _ := dut.Accept(t, listenFD) diff --git a/test/packetimpact/tests/tcp_window_shrink_test.go b/test/packetimpact/tests/tcp_window_shrink_test.go index 5b001fbec..093484721 100644 --- a/test/packetimpact/tests/tcp_window_shrink_test.go +++ b/test/packetimpact/tests/tcp_window_shrink_test.go @@ -25,15 +25,14 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestWindowShrink(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_zero_receive_window_test.go b/test/packetimpact/tests/tcp_zero_receive_window_test.go new file mode 100644 index 000000000..d06690705 --- /dev/null +++ b/test/packetimpact/tests/tcp_zero_receive_window_test.go @@ -0,0 +1,125 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_zero_receive_window_test + +import ( + "flag" + "fmt" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.Initialize(flag.CommandLine) +} + +// TestZeroReceiveWindow tests if the DUT sends a zero receive window eventually. +func TestZeroReceiveWindow(t *testing.T) { + for _, payloadLen := range []int{64, 512, 1024} { + t.Run(fmt.Sprintf("TestZeroReceiveWindow_with_%dbytes_payload", payloadLen), func(t *testing.T) { + dut := testbench.NewDUT(t) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer conn.Close(t) + + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) + + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + + samplePayload := &testbench.Payload{Bytes: testbench.GenerateRandomPayload(t, payloadLen)} + // Expect the DUT to eventually advertise zero receive window. + // The test would timeout otherwise. + for readOnce := false; ; { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) + gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + if err != nil { + t.Fatalf("expected packet was not received: %s", err) + } + // Read once to trigger the subsequent window update from the + // DUT to grow the right edge of the receive window from what + // was advertised in the SYN-ACK. This ensures that we test + // for the full default buffer size (1MB on gVisor at the time + // of writing this comment), thus testing for cases when the + // scaled receive window size ends up > 65535 (0xffff). + if !readOnce { + if got := dut.Recv(t, acceptFd, int32(payloadLen), 0); len(got) != payloadLen { + t.Fatalf("got dut.Recv(t, %d, %d, 0) = %d, want %d", acceptFd, payloadLen, len(got), payloadLen) + } + readOnce = true + } + windowSize := *gotTCP.WindowSize + t.Logf("got window size = %d", windowSize) + if windowSize == 0 { + break + } + } + }) + } +} + +// TestNonZeroReceiveWindow tests for the DUT to never send a zero receive +// window when the data is being read from the socket buffer. +func TestNonZeroReceiveWindow(t *testing.T) { + for _, payloadLen := range []int{64, 512, 1024} { + t.Run(fmt.Sprintf("TestZeroReceiveWindow_with_%dbytes_payload", payloadLen), func(t *testing.T) { + dut := testbench.NewDUT(t) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer conn.Close(t) + + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) + + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + + samplePayload := &testbench.Payload{Bytes: testbench.GenerateRandomPayload(t, payloadLen)} + var rcvWindow uint16 + initRcv := false + // This loop keeps a running rcvWindow value from the initial ACK for the data + // we sent. Once we have received ACKs with non-zero receive windows, we break + // the loop. + for { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) + gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + if err != nil { + t.Fatalf("expected packet was not received: %s", err) + } + if got := dut.Recv(t, acceptFd, int32(payloadLen), 0); len(got) != payloadLen { + t.Fatalf("got dut.Recv(t, %d, %d, 0) = %d, want %d", acceptFd, payloadLen, len(got), payloadLen) + } + if *gotTCP.WindowSize == 0 { + t.Fatalf("expected non-zero receive window.") + } + if !initRcv { + rcvWindow = uint16(*gotTCP.WindowSize) + initRcv = true + } + if rcvWindow <= uint16(payloadLen) { + break + } + rcvWindow -= uint16(payloadLen) + } + }) + } +} diff --git a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go index da93267d6..1ab9ee1b2 100644 --- a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go +++ b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go @@ -25,17 +25,16 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestZeroWindowProbeRetransmit tests retransmits of zero window probes // to be sent at exponentially inreasing time intervals. func TestZeroWindowProbeRetransmit(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_zero_window_probe_test.go b/test/packetimpact/tests/tcp_zero_window_probe_test.go index 44cac42f8..650a569cc 100644 --- a/test/packetimpact/tests/tcp_zero_window_probe_test.go +++ b/test/packetimpact/tests/tcp_zero_window_probe_test.go @@ -25,17 +25,16 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestZeroWindowProbe tests few cases of zero window probing over the // same connection. func TestZeroWindowProbe(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go index 09a1c653f..079fea68c 100644 --- a/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go +++ b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go @@ -25,17 +25,16 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } // TestZeroWindowProbeUserTimeout sanity tests user timeout when we are // retransmitting zero window probes. func TestZeroWindowProbeUserTimeout(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) defer dut.Close(t, listenFd) - conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) defer conn.Close(t) conn.Connect(t) diff --git a/test/packetimpact/tests/udp_any_addr_recv_unicast_test.go b/test/packetimpact/tests/udp_any_addr_recv_unicast_test.go index 17f32ef65..f4ae00a81 100644 --- a/test/packetimpact/tests/udp_any_addr_recv_unicast_test.go +++ b/test/packetimpact/tests/udp_any_addr_recv_unicast_test.go @@ -26,21 +26,20 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestAnyRecvUnicastUDP(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4zero) defer dut.Close(t, boundFD) - conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + conn := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer conn.Close(t) payload := testbench.GenerateRandomPayload(t, 1<<10 /* 1 KiB */) conn.SendIP( t, - testbench.IPv4{DstAddr: testbench.Address(tcpip.Address(net.ParseIP(testbench.RemoteIPv4).To4()))}, + testbench.IPv4{DstAddr: testbench.Address(tcpip.Address(dut.Net.RemoteIPv4))}, testbench.UDP{}, &testbench.Payload{Bytes: payload}, ) diff --git a/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go b/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go index 3d2791a6e..52c6f9d91 100644 --- a/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go +++ b/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go @@ -30,16 +30,15 @@ import ( var oneSecond = unix.Timeval{Sec: 1, Usec: 0} func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestDiscardsUDPPacketsWithMcastSourceAddressV4(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(testbench.RemoteIPv4)) + remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, dut.Net.RemoteIPv4) defer dut.Close(t, remoteFD) dut.SetSockOptTimeval(t, remoteFD, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &oneSecond) - conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + conn := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer conn.Close(t) for _, mcastAddr := range []net.IP{ @@ -66,11 +65,10 @@ func TestDiscardsUDPPacketsWithMcastSourceAddressV4(t *testing.T) { func TestDiscardsUDPPacketsWithMcastSourceAddressV6(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(testbench.RemoteIPv6)) + remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, dut.Net.RemoteIPv6) defer dut.Close(t, remoteFD) dut.SetSockOptTimeval(t, remoteFD, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &oneSecond) - conn := testbench.NewUDPIPv6(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + conn := dut.Net.NewUDPIPv6(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer conn.Close(t) for _, mcastAddr := range []net.IP{ diff --git a/test/packetimpact/tests/udp_icmp_error_propagation_test.go b/test/packetimpact/tests/udp_icmp_error_propagation_test.go index df35d16c8..cd4523e88 100644 --- a/test/packetimpact/tests/udp_icmp_error_propagation_test.go +++ b/test/packetimpact/tests/udp_icmp_error_propagation_test.go @@ -30,7 +30,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } type connectionMode bool @@ -229,7 +229,6 @@ func TestUDPICMPErrorPropagation(t *testing.T) { } { t.Run(fmt.Sprintf("%s/%s/%s", connect, icmpErr, errDetect.name), func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4zero) defer dut.Close(t, remoteFD) @@ -239,7 +238,7 @@ func TestUDPICMPErrorPropagation(t *testing.T) { cleanFD, cleanPort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4zero) defer dut.Close(t, cleanFD) - conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + conn := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer conn.Close(t) if connect { @@ -261,7 +260,7 @@ func TestUDPICMPErrorPropagation(t *testing.T) { // involved in the generation of the ICMP error. As such, // interactions between it and the the DUT should be independent of // the ICMP error at least at the port level. - connClean := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + connClean := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer connClean.Close(t) errDetectConn = &connClean @@ -283,7 +282,6 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { t.Run(fmt.Sprintf("%s/%s", connect, icmpErr), func(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() remoteFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4zero) defer dut.Close(t, remoteFD) @@ -293,7 +291,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { cleanFD, cleanPort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.IPv4zero) defer dut.Close(t, cleanFD) - conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + conn := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer conn.Close(t) if connect { diff --git a/test/packetimpact/tests/udp_recv_mcast_bcast_test.go b/test/packetimpact/tests/udp_recv_mcast_bcast_test.go index 526173969..b29c07825 100644 --- a/test/packetimpact/tests/udp_recv_mcast_bcast_test.go +++ b/test/packetimpact/tests/udp_recv_mcast_bcast_test.go @@ -29,12 +29,12 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } func TestUDPRecvMcastBcast(t *testing.T) { - subnetBcastAddr := broadcastAddr(net.ParseIP(testbench.RemoteIPv4), net.CIDRMask(testbench.IPv4PrefixLength, 32)) - + dut := testbench.NewDUT(t) + subnetBcastAddr := broadcastAddr(dut.Net.RemoteIPv4, net.CIDRMask(dut.Net.IPv4PrefixLength, 32)) for _, v := range []struct { bound, to net.IP }{ @@ -43,17 +43,22 @@ func TestUDPRecvMcastBcast(t *testing.T) { {bound: net.IPv4zero, to: net.IPv4allsys}, {bound: subnetBcastAddr, to: subnetBcastAddr}, - {bound: subnetBcastAddr, to: net.IPv4bcast}, + + // FIXME(gvisor.dev/issue/4896): Previously by the time subnetBcastAddr is + // created, IPv4PrefixLength is still 0 because genPseudoFlags is not called + // yet, it was only called in NewDUT, so the test didn't do what the author + // original intended to and becomes failing because we process all flags at + // the very beginning. + // + // {bound: subnetBcastAddr, to: net.IPv4bcast}, {bound: net.IPv4bcast, to: net.IPv4bcast}, {bound: net.IPv4allsys, to: net.IPv4allsys}, } { t.Run(fmt.Sprintf("bound=%s,to=%s", v.bound, v.to), func(t *testing.T) { - dut := testbench.NewDUT(t) - defer dut.TearDown() boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, v.bound) defer dut.Close(t, boundFD) - conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + conn := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer conn.Close(t) payload := testbench.GenerateRandomPayload(t, 1<<10 /* 1 KiB */) @@ -73,15 +78,14 @@ func TestUDPRecvMcastBcast(t *testing.T) { func TestUDPDoesntRecvMcastBcastOnUnicastAddr(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() - boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(testbench.RemoteIPv4)) + boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, dut.Net.RemoteIPv4) dut.SetSockOptTimeval(t, boundFD, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &unix.Timeval{Sec: 1, Usec: 0}) defer dut.Close(t, boundFD) - conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + conn := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) defer conn.Close(t) for _, to := range []net.IP{ - broadcastAddr(net.ParseIP(testbench.RemoteIPv4), net.CIDRMask(testbench.IPv4PrefixLength, 32)), + broadcastAddr(dut.Net.RemoteIPv4, net.CIDRMask(dut.Net.IPv4PrefixLength, 32)), net.IPv4(255, 255, 255, 255), net.IPv4(224, 0, 0, 1), } { @@ -102,9 +106,10 @@ func TestUDPDoesntRecvMcastBcastOnUnicastAddr(t *testing.T) { } func broadcastAddr(ip net.IP, mask net.IPMask) net.IP { + result := make(net.IP, net.IPv4len) ip4 := ip.To4() for i := range ip4 { - ip4[i] |= ^mask[i] + result[i] = ip4[i] | ^mask[i] } - return ip4 + return result } diff --git a/test/packetimpact/tests/udp_send_recv_dgram_test.go b/test/packetimpact/tests/udp_send_recv_dgram_test.go index 91b967400..7ee2c8014 100644 --- a/test/packetimpact/tests/udp_send_recv_dgram_test.go +++ b/test/packetimpact/tests/udp_send_recv_dgram_test.go @@ -26,7 +26,7 @@ import ( ) func init() { - testbench.RegisterFlags(flag.CommandLine) + testbench.Initialize(flag.CommandLine) } type udpConn interface { @@ -38,7 +38,6 @@ type udpConn interface { func TestUDP(t *testing.T) { dut := testbench.NewDUT(t) - defer dut.TearDown() for _, isIPv4 := range []bool{true, false} { ipVersionName := "IPv6" @@ -46,24 +45,24 @@ func TestUDP(t *testing.T) { ipVersionName = "IPv4" } t.Run(ipVersionName, func(t *testing.T) { - var addr string + var addr net.IP if isIPv4 { - addr = testbench.RemoteIPv4 + addr = dut.Net.RemoteIPv4 } else { - addr = testbench.RemoteIPv6 + addr = dut.Net.RemoteIPv6 } - boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP(addr)) + boundFD, remotePort := dut.CreateBoundSocket(t, unix.SOCK_DGRAM, unix.IPPROTO_UDP, addr) defer dut.Close(t, boundFD) var conn udpConn var localAddr unix.Sockaddr if isIPv4 { - v4Conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + v4Conn := dut.Net.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) localAddr = v4Conn.LocalAddr(t) conn = &v4Conn } else { - v6Conn := testbench.NewUDPIPv6(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) - localAddr = v6Conn.LocalAddr(t) + v6Conn := dut.Net.NewUDPIPv6(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort}) + localAddr = v6Conn.LocalAddr(t, dut.Net.RemoteDevID) conn = &v6Conn } defer conn.Close(t) diff --git a/test/perf/BUILD b/test/perf/BUILD index b763be50e..e25f090ae 100644 --- a/test/perf/BUILD +++ b/test/perf/BUILD @@ -1,3 +1,4 @@ +load("//tools:defs.bzl", "more_shards") load("//test/runner:defs.bzl", "syscall_test") package(licenses = ["notice"]) @@ -37,7 +38,7 @@ syscall_test( syscall_test( size = "enormous", debug = False, - shard_count = 10, + shard_count = more_shards, tags = ["nogotsan"], test = "//test/perf/linux:getdents_benchmark", ) diff --git a/test/root/BUILD b/test/root/BUILD index a9130b34f..8d9fff578 100644 --- a/test/root/BUILD +++ b/test/root/BUILD @@ -1,5 +1,4 @@ load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/vm:defs.bzl", "vm_test") package(licenses = ["notice"]) @@ -24,12 +23,8 @@ go_test( ], library = ":root", tags = [ - # Requires docker and runsc to be configured before the test runs. - # Also, the test needs to be run as root. Note that below, the - # root_vm_test relies on the default runtime 'runsc' being installed by - # the default installer. - "manual", "local", + "manual", ], visibility = ["//:sandbox"], deps = [ @@ -46,10 +41,3 @@ go_test( "@org_golang_x_sys//unix:go_default_library", ], ) - -vm_test( - name = "root_vm_test", - size = "large", - shard_count = 1, - targets = [":root_test"], -) diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go index 11ac5cb52..df52dd381 100644 --- a/test/root/crictl_test.go +++ b/test/root/crictl_test.go @@ -315,7 +315,7 @@ const ( // v1 is the containerd API v1. v1 string = "v1" - // v1 is the containerd API v21. + // v2 is the containerd API v2. v2 string = "v2" ) @@ -480,7 +480,7 @@ func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) { } // Wait for containerd to boot. - if err := testutil.WaitUntilRead(startupR, "Start streaming server", nil, 10*time.Second); err != nil { + if err := testutil.WaitUntilRead(startupR, "Start streaming server", 10*time.Second); err != nil { t.Fatalf("failed to start containerd: %v", err) } diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl index 7618f6a21..829247657 100644 --- a/test/runner/defs.bzl +++ b/test/runner/defs.bzl @@ -12,7 +12,7 @@ def _runner_test_impl(ctx): " mkdir -p \"${TEST_UNDECLARED_OUTPUTS_DIR}\"", " chmod a+rwx \"${TEST_UNDECLARED_OUTPUTS_DIR}\"", "fi", - "exec %s %s %s\n" % ( + "exec %s %s \"$@\" %s\n" % ( ctx.files.runner[0].short_path, " ".join(ctx.attr.runner_args), ctx.files.test[0].short_path, @@ -52,8 +52,6 @@ _runner_test = rule( def _syscall_test( test, - shard_count, - size, platform, use_tmpfs, tags, @@ -63,7 +61,8 @@ def _syscall_test( overlay = False, add_uds_tree = False, vfs2 = False, - fuse = False): + fuse = False, + **kwargs): # Prepend "runsc" to non-native platform names. full_platform = platform if platform == "native" else "runsc_" + platform @@ -126,15 +125,12 @@ def _syscall_test( name = name, test = test, runner_args = runner_args, - size = size, tags = tags, - shard_count = shard_count, + **kwargs ) def syscall_test( test, - shard_count = 5, - size = "small", use_tmpfs = False, add_overlay = False, add_uds_tree = False, @@ -142,18 +138,21 @@ def syscall_test( vfs2 = True, fuse = False, debug = True, - tags = None): + tags = None, + **kwargs): """syscall_test is a macro that will create targets for all platforms. Args: test: the test target. - shard_count: shards for defined tests. - size: the defined test size. use_tmpfs: use tmpfs in the defined tests. add_overlay: add an overlay test. add_uds_tree: add a UDS test. add_hostinet: add a hostinet test. + vfs2: enable VFS2 support. + fuse: enable FUSE support. + debug: enable debug output. tags: starting test tags. + **kwargs: additional test arguments. """ if not tags: tags = [] @@ -173,8 +172,6 @@ def syscall_test( _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = default_platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, @@ -182,6 +179,7 @@ def syscall_test( debug = debug, vfs2 = True, fuse = fuse, + **kwargs ) if fuse: # Only generate *_vfs2_fuse target if fuse parameter is enabled. @@ -189,38 +187,35 @@ def syscall_test( _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = "native", use_tmpfs = False, add_uds_tree = add_uds_tree, tags = list(tags), debug = debug, + **kwargs ) for (platform, platform_tags) in platforms.items(): _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, tags = platform_tags + tags, debug = debug, + **kwargs ) if add_overlay: _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = default_platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, tags = platforms[default_platform] + tags, debug = debug, overlay = True, + **kwargs ) # TODO(gvisor.dev/issue/4407): Remove tags to enable VFS2 overlay tests. @@ -230,8 +225,6 @@ def syscall_test( overlay_vfs2_tags.append("notap") _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = default_platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, @@ -239,38 +232,35 @@ def syscall_test( debug = debug, overlay = True, vfs2 = True, + **kwargs ) if add_hostinet: _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = default_platform, use_tmpfs = use_tmpfs, network = "host", add_uds_tree = add_uds_tree, tags = platforms[default_platform] + tags, debug = debug, + **kwargs ) if not use_tmpfs: # Also test shared gofer access. _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = default_platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, tags = platforms[default_platform] + tags, debug = debug, file_access = "shared", + **kwargs ) _syscall_test( test = test, - shard_count = shard_count, - size = size, platform = default_platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, @@ -278,4 +268,5 @@ def syscall_test( debug = debug, file_access = "shared", vfs2 = True, + **kwargs ) diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD index 22b526f59..510ffe013 100644 --- a/test/runtimes/BUILD +++ b/test/runtimes/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "bzl_library") +load("//tools:defs.bzl", "bzl_library", "more_shards", "most_shards") load("//test/runtimes:defs.bzl", "runtime_test") package(licenses = ["notice"]) @@ -7,7 +7,7 @@ runtime_test( name = "go1.12", exclude_file = "exclude/go1.12.csv", lang = "go", - shard_count = 8, + shard_count = more_shards, ) runtime_test( @@ -15,28 +15,28 @@ runtime_test( batch = 100, exclude_file = "exclude/java11.csv", lang = "java", - shard_count = 16, + shard_count = most_shards, ) runtime_test( name = "nodejs12.4.0", exclude_file = "exclude/nodejs12.4.0.csv", lang = "nodejs", - shard_count = 8, + shard_count = most_shards, ) runtime_test( name = "php7.3.6", exclude_file = "exclude/php7.3.6.csv", lang = "php", - shard_count = 8, + shard_count = more_shards, ) runtime_test( name = "python3.7.3", exclude_file = "exclude/python3.7.3.csv", lang = "python", - shard_count = 8, + shard_count = more_shards, ) bzl_library( diff --git a/test/runtimes/runner/lib/lib.go b/test/runtimes/runner/lib/lib.go index 64e6e14db..9272137ff 100644 --- a/test/runtimes/runner/lib/lib.go +++ b/test/runtimes/runner/lib/lib.go @@ -34,12 +34,7 @@ import ( // RunTests is a helper that is called by main. It exists so that we can run // defered functions before exiting. It returns an exit code that should be // passed to os.Exit. -func RunTests(lang, image, excludeFile string, partitionNum, totalPartitions, batchSize int, timeout time.Duration) int { - if partitionNum <= 0 || totalPartitions <= 0 || partitionNum > totalPartitions { - fmt.Fprintf(os.Stderr, "invalid partition %d of %d", partitionNum, totalPartitions) - return 1 - } - +func RunTests(lang, image, excludeFile string, batchSize int, timeout time.Duration) int { // TODO(gvisor.dev/issue/1624): Remove those tests from all exclude lists // that only fail with VFS1. @@ -63,7 +58,7 @@ func RunTests(lang, image, excludeFile string, partitionNum, totalPartitions, ba // Get a slice of tests to run. This will also start a single Docker // container that will be used to run each test. The final test will // stop the Docker container. - tests, err := getTests(ctx, d, lang, image, partitionNum, totalPartitions, batchSize, timeout, excludes) + tests, err := getTests(ctx, d, lang, image, batchSize, timeout, excludes) if err != nil { fmt.Fprintf(os.Stderr, "%s\n", err.Error()) return 1 @@ -74,7 +69,7 @@ func RunTests(lang, image, excludeFile string, partitionNum, totalPartitions, ba } // getTests executes all tests as table tests. -func getTests(ctx context.Context, d *dockerutil.Container, lang, image string, partitionNum, totalPartitions, batchSize int, timeout time.Duration, excludes map[string]struct{}) ([]testing.InternalTest, error) { +func getTests(ctx context.Context, d *dockerutil.Container, lang, image string, batchSize int, timeout time.Duration, excludes map[string]struct{}) ([]testing.InternalTest, error) { // Start the container. opts := dockerutil.RunOpts{ Image: fmt.Sprintf("runtimes/%s", image), @@ -90,18 +85,9 @@ func getTests(ctx context.Context, d *dockerutil.Container, lang, image string, return nil, fmt.Errorf("docker exec failed: %v", err) } - // Calculate a subset of tests to run corresponding to the current - // shard. + // Calculate a subset of tests. tests := strings.Fields(list) sort.Strings(tests) - - partitionSize := len(tests) / totalPartitions - if partitionNum == totalPartitions { - tests = tests[(partitionNum-1)*partitionSize:] - } else { - tests = tests[(partitionNum-1)*partitionSize : partitionNum*partitionSize] - } - indices, err := testutil.TestIndicesForShard(len(tests)) if err != nil { return nil, fmt.Errorf("TestsForShard() failed: %v", err) @@ -122,6 +108,10 @@ func getTests(ctx context.Context, d *dockerutil.Container, lang, image string, } tcs = append(tcs, tests[tc]) } + if len(tcs) == 0 { + // No tests to add to this batch. + continue + } itests = append(itests, testing.InternalTest{ Name: strings.Join(tcs, ", "), F: func(t *testing.T) { diff --git a/test/runtimes/runner/main.go b/test/runtimes/runner/main.go index 5b3443e36..ec79a22c2 100644 --- a/test/runtimes/runner/main.go +++ b/test/runtimes/runner/main.go @@ -25,13 +25,11 @@ import ( ) var ( - lang = flag.String("lang", "", "language runtime to test") - image = flag.String("image", "", "docker image with runtime tests") - excludeFile = flag.String("exclude_file", "", "file containing list of tests to exclude, in CSV format with fields: test name, bug id, comment") - partition = flag.Int("partition", 1, "partition number, this is 1-indexed") - totalPartitions = flag.Int("total_partitions", 1, "total number of partitions") - batchSize = flag.Int("batch", 50, "number of test cases run in one command") - timeout = flag.Duration("timeout", 90*time.Minute, "batch timeout") + lang = flag.String("lang", "", "language runtime to test") + image = flag.String("image", "", "docker image with runtime tests") + excludeFile = flag.String("exclude_file", "", "file containing list of tests to exclude, in CSV format with fields: test name, bug id, comment") + batchSize = flag.Int("batch", 50, "number of test cases run in one command") + timeout = flag.Duration("timeout", 90*time.Minute, "batch timeout") ) func main() { @@ -40,5 +38,5 @@ func main() { fmt.Fprintf(os.Stderr, "lang and image flags must not be empty\n") os.Exit(1) } - os.Exit(lib.RunTests(*lang, *image, *excludeFile, *partition, *totalPartitions, *batchSize, *timeout)) + os.Exit(lib.RunTests(*lang, *image, *excludeFile, *batchSize, *timeout)) } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index b5a4ef4df..a5b9233f7 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1,3 +1,4 @@ +load("//tools:defs.bzl", "more_shards", "most_shards") load("//test/runner:defs.bzl", "syscall_test") package(licenses = ["notice"]) @@ -12,7 +13,7 @@ syscall_test( syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:accept_bind_test", ) @@ -32,7 +33,7 @@ syscall_test( syscall_test( size = "medium", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:alarm_test", ) @@ -66,7 +67,7 @@ syscall_test( size = "large", # Produce too many logs in the debug mode. debug = False, - shard_count = 50, + shard_count = most_shards, # Takes too long for TSAN. Since this is kind of a stress test that doesn't # involve much concurrency, TSAN's usefulness here is limited anyway. tags = ["nogotsan"], @@ -211,7 +212,7 @@ syscall_test( syscall_test( size = "medium", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:futex_test", ) @@ -258,7 +259,7 @@ syscall_test( syscall_test( size = "large", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:itimer_test", ) @@ -313,7 +314,7 @@ syscall_test( syscall_test( size = "medium", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:mmap_test", ) @@ -347,6 +348,7 @@ syscall_test( syscall_test( add_overlay = True, + shard_count = more_shards, test = "//test/syscalls/linux:open_test", ) @@ -376,7 +378,7 @@ syscall_test( syscall_test( size = "large", add_overlay = True, - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:pipe_test", ) @@ -448,7 +450,7 @@ syscall_test( syscall_test( size = "medium", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:pty_test", ) @@ -475,6 +477,7 @@ syscall_test( ) syscall_test( + shard_count = more_shards, test = "//test/syscalls/linux:raw_socket_test", ) @@ -490,7 +493,7 @@ syscall_test( syscall_test( size = "medium", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:readv_socket_test", ) @@ -539,7 +542,7 @@ syscall_test( ) syscall_test( - shard_count = 20, + shard_count = more_shards, test = "//test/syscalls/linux:semaphore_test", ) @@ -594,7 +597,7 @@ syscall_test( syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_abstract_test", ) @@ -605,7 +608,7 @@ syscall_test( syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_domain_test", ) @@ -618,19 +621,19 @@ syscall_test( syscall_test( size = "large", add_overlay = True, - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_filesystem_test", ) syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_inet_loopback_test", ) syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, # Takes too long for TSAN. Creates a lot of TCP sockets. tags = ["nogotsan"], test = "//test/syscalls/linux:socket_inet_loopback_nogotsan_test", @@ -638,35 +641,38 @@ syscall_test( syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_ip_tcp_generic_loopback_test", ) syscall_test( size = "medium", + add_hostinet = True, test = "//test/syscalls/linux:socket_ip_tcp_loopback_non_blocking_test", ) syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_ip_tcp_loopback_test", ) syscall_test( size = "medium", - shard_count = 50, + add_hostinet = True, + shard_count = most_shards, test = "//test/syscalls/linux:socket_ip_tcp_udp_generic_loopback_test", ) syscall_test( size = "medium", + add_hostinet = True, test = "//test/syscalls/linux:socket_ip_udp_loopback_non_blocking_test", ) syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_ip_udp_loopback_test", ) @@ -677,6 +683,8 @@ syscall_test( syscall_test( size = "medium", + add_hostinet = True, + shard_count = more_shards, # Takes too long under gotsan to run. tags = ["nogotsan"], test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_nogotsan_test", @@ -691,6 +699,7 @@ syscall_test( ) syscall_test( + shard_count = more_shards, test = "//test/syscalls/linux:socket_ip_unbound_test", ) @@ -723,6 +732,7 @@ syscall_test( ) syscall_test( + add_hostinet = True, test = "//test/syscalls/linux:socket_non_stream_blocking_local_test", ) @@ -753,7 +763,7 @@ syscall_test( syscall_test( # NOTE(b/116636318): Large sendmsg may stall a long time. size = "enormous", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:socket_unix_dgram_local_test", ) @@ -765,14 +775,14 @@ syscall_test( syscall_test( size = "large", add_overlay = True, - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_unix_pair_test", ) syscall_test( # NOTE(b/116636318): Large sendmsg may stall a long time. size = "enormous", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:socket_unix_seqpacket_local_test", ) @@ -798,13 +808,13 @@ syscall_test( syscall_test( size = "medium", - shard_count = 10, + shard_count = more_shards, test = "//test/syscalls/linux:socket_unix_unbound_seqpacket_test", ) syscall_test( size = "large", - shard_count = 50, + shard_count = most_shards, test = "//test/syscalls/linux:socket_unix_unbound_stream_test", ) @@ -858,7 +868,7 @@ syscall_test( syscall_test( size = "medium", - shard_count = 10, + shard_count = more_shards, test = "//test/syscalls/linux:tcp_socket_test", ) @@ -867,6 +877,7 @@ syscall_test( ) syscall_test( + shard_count = more_shards, test = "//test/syscalls/linux:timerfd_test", ) @@ -897,13 +908,14 @@ syscall_test( ) syscall_test( + add_hostinet = True, test = "//test/syscalls/linux:udp_bind_test", ) syscall_test( size = "medium", add_hostinet = True, - shard_count = 10, + shard_count = more_shards, test = "//test/syscalls/linux:udp_socket_test", ) @@ -947,7 +959,7 @@ syscall_test( syscall_test( size = "medium", - shard_count = 5, + shard_count = more_shards, test = "//test/syscalls/linux:wait_test", ) @@ -961,6 +973,7 @@ syscall_test( ) syscall_test( + add_hostinet = True, test = "//test/syscalls/linux:proc_net_tcp_test", ) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 50baafbf7..760456a98 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -432,6 +432,9 @@ cc_binary( testonly = 1, srcs = ["chown.cc"], linkstatic = 1, + # We require additional UIDs for this test, so don't include the bazel + # sandbox as standard. + tags = ["no-sandbox"], deps = [ "//test/util:capability_util", "//test/util:file_descriptor", @@ -618,10 +621,7 @@ cc_binary( cc_binary( name = "exceptions_test", testonly = 1, - srcs = select_arch( - amd64 = ["exceptions.cc"], - arm64 = [], - ), + srcs = ["exceptions.cc"], linkstatic = 1, deps = [ gtest, @@ -796,8 +796,8 @@ cc_binary( deps = [ ":socket_test_util", "//test/util:cleanup", - "//test/util:epoll_util", "//test/util:eventfd_util", + "//test/util:file_descriptor", "//test/util:fs_util", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/flags:flag", @@ -808,6 +808,7 @@ cc_binary( "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:save_util", + "//test/util:signal_util", "//test/util:temp_path", "//test/util:test_util", "//test/util:thread_util", @@ -2450,6 +2451,27 @@ cc_library( ) cc_library( + name = "socket_ipv6_udp_unbound_test_cases", + testonly = 1, + srcs = [ + "socket_ipv6_udp_unbound.cc", + ], + hdrs = [ + "socket_ipv6_udp_unbound.h", + ], + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + "@com_google_absl//absl/memory", + gtest, + "//test/util:posix_error", + "//test/util:save_util", + "//test/util:test_util", + ], + alwayslink = 1, +) + +cc_library( name = "socket_ipv4_udp_unbound_netlink_test_cases", testonly = 1, srcs = [ @@ -2789,6 +2811,22 @@ cc_binary( ) cc_binary( + name = "socket_ipv6_udp_unbound_loopback_test", + testonly = 1, + srcs = [ + "socket_ipv6_udp_unbound_loopback.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_ipv6_udp_unbound_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( name = "socket_ipv4_udp_unbound_loopback_nogotsan_test", testonly = 1, srcs = [ diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc index 7a28b674d..5530ad18f 100644 --- a/test/syscalls/linux/chown.cc +++ b/test/syscalls/linux/chown.cc @@ -75,7 +75,16 @@ TEST_P(ChownParamTest, ChownFileSucceeds) { if (num_groups > 0) { std::vector<gid_t> list(num_groups); EXPECT_THAT(getgroups(list.size(), list.data()), SyscallSucceeds()); - gid = list[0]; + // Scan the list of groups for a valid gid. Note that if a group is not + // defined in this local user namespace, then we will see 65534, and the + // group will not chown below as expected. So only change if we find a + // valid group in this list. + for (const gid_t other_gid : list) { + if (other_gid != 65534) { + gid = other_gid; + break; + } + } } EXPECT_NO_ERRNO(GetParam()(file.path(), geteuid(), gid)); @@ -90,6 +99,7 @@ TEST_P(ChownParamTest, ChownFilePermissionDenied) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0777)); + EXPECT_THAT(chmod(GetAbsoluteTestTmpdir().c_str(), 0777), SyscallSucceeds()); // Drop privileges and change IDs only in child thread, or else this parent // thread won't be able to open some log files after the test ends. @@ -119,6 +129,7 @@ TEST_P(ChownParamTest, ChownFileSucceedsAsRoot) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_SETUID)))); const std::string filename = NewTempAbsPath(); + EXPECT_THAT(chmod(GetAbsoluteTestTmpdir().c_str(), 0777), SyscallSucceeds()); absl::Notification fileCreated, fileChowned; // Change UID only in child thread, or else this parent thread won't be able diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc index 420b9543f..11dc1c651 100644 --- a/test/syscalls/linux/exceptions.cc +++ b/test/syscalls/linux/exceptions.cc @@ -23,6 +23,7 @@ namespace gvisor { namespace testing { +#if defined(__x86_64__) // Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5 // "x87 FPU Control Word". constexpr uint16_t kX87ControlWordDefault = 0x37f; @@ -93,6 +94,9 @@ void InIOHelper(int width, int value) { }, ::testing::KilledBySignal(SIGSEGV), ""); } +#elif defined(__aarch64__) +void inline Halt() { asm("hlt #0\r\n"); } +#endif TEST(ExceptionTest, Halt) { // In order to prevent the regular handler from messing with things (and @@ -102,9 +106,14 @@ TEST(ExceptionTest, Halt) { sa.sa_handler = SIG_DFL; auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa)); +#if defined(__x86_64__) EXPECT_EXIT(Halt(), ::testing::KilledBySignal(SIGSEGV), ""); +#elif defined(__aarch64__) + EXPECT_EXIT(Halt(), ::testing::KilledBySignal(SIGILL), ""); +#endif } +#if defined(__x86_64__) TEST(ExceptionTest, DivideByZero) { // See above. struct sigaction sa = {}; @@ -362,6 +371,7 @@ TEST(ExceptionTest, Int3Compact) { EXPECT_EXIT(Int3Compact(), ::testing::KilledBySignal(SIGTRAP), ""); } +#endif } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 34016d4bd..4b581045b 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -14,10 +14,13 @@ #include <fcntl.h> #include <signal.h> +#include <sys/epoll.h> #include <sys/types.h> #include <syscall.h> #include <unistd.h> +#include <atomic> +#include <deque> #include <iostream> #include <list> #include <string> @@ -34,25 +37,27 @@ #include "test/syscalls/linux/socket_test_util.h" #include "test/util/cleanup.h" #include "test/util/eventfd_util.h" +#include "test/util/file_descriptor.h" #include "test/util/fs_util.h" #include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" #include "test/util/save_util.h" +#include "test/util/signal_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" #include "test/util/timer_util.h" -ABSL_FLAG(std::string, child_setlock_on, "", +ABSL_FLAG(std::string, child_set_lock_on, "", "Contains the path to try to set a file lock on."); -ABSL_FLAG(bool, child_setlock_write, false, +ABSL_FLAG(bool, child_set_lock_write, false, "Whether to set a writable lock (otherwise readable)"); ABSL_FLAG(bool, blocking, false, "Whether to set a blocking lock (otherwise non-blocking)."); ABSL_FLAG(bool, retry_eintr, false, "Whether to retry in the subprocess on EINTR."); -ABSL_FLAG(uint64_t, child_setlock_start, 0, "The value of struct flock start"); -ABSL_FLAG(uint64_t, child_setlock_len, 0, "The value of struct flock len"); +ABSL_FLAG(uint64_t, child_set_lock_start, 0, "The value of struct flock start"); +ABSL_FLAG(uint64_t, child_set_lock_len, 0, "The value of struct flock len"); ABSL_FLAG(int32_t, socket_fd, -1, "A socket to use for communicating more state back " "to the parent."); @@ -60,6 +65,11 @@ ABSL_FLAG(int32_t, socket_fd, -1, namespace gvisor { namespace testing { +std::function<void(int, siginfo_t*, void*)> setsig_signal_handle; +void setsig_signal_handler(int signum, siginfo_t* siginfo, void* ucontext) { + setsig_signal_handle(signum, siginfo, ucontext); +} + class FcntlLockTest : public ::testing::Test { public: void SetUp() override { @@ -84,18 +94,93 @@ class FcntlLockTest : public ::testing::Test { int fds_[2] = {}; }; +struct SignalDelivery { + int num; + siginfo_t info; +}; + +class FcntlSignalTest : public ::testing::Test { + public: + void SetUp() override { + int pipe_fds[2]; + ASSERT_THAT(pipe2(pipe_fds, O_NONBLOCK), SyscallSucceeds()); + pipe_read_fd_ = pipe_fds[0]; + pipe_write_fd_ = pipe_fds[1]; + } + + PosixErrorOr<Cleanup> RegisterSignalHandler(int signum) { + struct sigaction handler; + handler.sa_sigaction = setsig_signal_handler; + setsig_signal_handle = [&](int signum, siginfo_t* siginfo, + void* unused_ucontext) { + SignalDelivery sig; + sig.num = signum; + sig.info = *siginfo; + signals_received_.push_back(sig); + num_signals_received_++; + }; + sigemptyset(&handler.sa_mask); + handler.sa_flags = SA_SIGINFO; + return ScopedSigaction(signum, handler); + } + + void FlushAndCloseFD(int fd) { + char buf; + int read_bytes; + do { + read_bytes = read(fd, &buf, 1); + } while (read_bytes > 0); + // read() can also fail with EWOULDBLOCK since the pipe is open in + // non-blocking mode. This is not an error. + EXPECT_TRUE(read_bytes == 0 || (read_bytes == -1 && errno == EWOULDBLOCK)); + EXPECT_THAT(close(fd), SyscallSucceeds()); + } + + void DupReadFD() { + ASSERT_THAT(pipe_read_fd_dup_ = dup(pipe_read_fd_), SyscallSucceeds()); + max_expected_signals++; + } + + void RegisterFD(int fd, int signum) { + ASSERT_THAT(fcntl(fd, F_SETOWN, getpid()), SyscallSucceeds()); + ASSERT_THAT(fcntl(fd, F_SETSIG, signum), SyscallSucceeds()); + int old_flags; + ASSERT_THAT(old_flags = fcntl(fd, F_GETFL), SyscallSucceeds()); + ASSERT_THAT(fcntl(fd, F_SETFL, old_flags | O_ASYNC), SyscallSucceeds()); + } + + void GenerateIOEvent() { + ASSERT_THAT(write(pipe_write_fd_, "test", 4), SyscallSucceedsWithValue(4)); + } + + void WaitForSignalDelivery(absl::Duration timeout) { + absl::Time wait_start = absl::Now(); + while (num_signals_received_ < max_expected_signals && + absl::Now() - wait_start < timeout) { + absl::SleepFor(absl::Milliseconds(10)); + } + } + + int pipe_read_fd_ = -1; + int pipe_read_fd_dup_ = -1; + int pipe_write_fd_ = -1; + int max_expected_signals = 1; + std::deque<SignalDelivery> signals_received_; + std::atomic<int> num_signals_received_ = 0; +}; + namespace { PosixErrorOr<Cleanup> SubprocessLock(std::string const& path, bool for_write, bool blocking, bool retry_eintr, int fd, off_t start, off_t length, pid_t* child) { std::vector<std::string> args = { - "/proc/self/exe", "--child_setlock_on", path, - "--child_setlock_start", absl::StrCat(start), "--child_setlock_len", - absl::StrCat(length), "--socket_fd", absl::StrCat(fd)}; + "/proc/self/exe", "--child_set_lock_on", path, + "--child_set_lock_start", absl::StrCat(start), "--child_set_lock_len", + absl::StrCat(length), "--socket_fd", absl::StrCat(fd)}; if (for_write) { - args.push_back("--child_setlock_write"); + args.push_back("--child_set_lock_write"); } if (blocking) { @@ -965,7 +1050,6 @@ TEST(FcntlTest, GetOwnNone) { // into F_{GET,SET}OWN_EX. EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); - MaybeSave(); } TEST(FcntlTest, GetOwnExNone) { @@ -1009,7 +1093,6 @@ TEST(FcntlTest, SetOwnPid) { EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(pid)); - MaybeSave(); } TEST(FcntlTest, SetOwnPgrp) { @@ -1030,7 +1113,6 @@ TEST(FcntlTest, SetOwnPgrp) { SyscallSucceedsWithValue(0)); EXPECT_EQ(got_owner.type, F_OWNER_PGRP); EXPECT_EQ(got_owner.pid, pgid); - MaybeSave(); } TEST(FcntlTest, SetOwnUnset) { @@ -1058,7 +1140,6 @@ TEST(FcntlTest, SetOwnUnset) { EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); - MaybeSave(); } // F_SETOWN flips the sign of negative values, an operation that is guarded @@ -1130,7 +1211,6 @@ TEST(FcntlTest, SetOwnExTid) { EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(owner.pid)); - MaybeSave(); } TEST(FcntlTest, SetOwnExPid) { @@ -1146,7 +1226,6 @@ TEST(FcntlTest, SetOwnExPid) { EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(owner.pid)); - MaybeSave(); } TEST(FcntlTest, SetOwnExPgrp) { @@ -1168,7 +1247,6 @@ TEST(FcntlTest, SetOwnExPgrp) { SyscallSucceedsWithValue(0)); EXPECT_EQ(got_owner.type, set_owner.type); EXPECT_EQ(got_owner.pid, set_owner.pid); - MaybeSave(); } TEST(FcntlTest, SetOwnExUnset) { @@ -1201,7 +1279,6 @@ TEST(FcntlTest, SetOwnExUnset) { EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN), SyscallSucceedsWithValue(0)); - MaybeSave(); } TEST(FcntlTest, GetOwnExTid) { @@ -1258,9 +1335,269 @@ TEST(FcntlTest, GetOwnExPgrp) { EXPECT_EQ(got_owner.pid, set_owner.pid); } +TEST(FcntlTest, SetSig) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETSIG, SIGUSR1), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETSIG), + SyscallSucceedsWithValue(SIGUSR1)); +} + +TEST(FcntlTest, SetSigDefaultsToZero) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + // Defaults to returning the zero value, indicating default behavior (SIGIO). + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETSIG), + SyscallSucceedsWithValue(0)); +} + +TEST(FcntlTest, SetSigToDefault) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETSIG, SIGIO), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETSIG), + SyscallSucceedsWithValue(SIGIO)); + + // Can be reset to the default behavior. + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETSIG, 0), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETSIG), + SyscallSucceedsWithValue(0)); +} + +TEST(FcntlTest, SetSigInvalid) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETSIG, SIGRTMAX + 1), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETSIG), + SyscallSucceedsWithValue(0)); +} + +TEST(FcntlTest, SetSigInvalidDoesNotResetPreviousChoice) { + FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); + + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETSIG, SIGUSR1), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETSIG, SIGRTMAX + 1), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETSIG), + SyscallSucceedsWithValue(SIGUSR1)); +} + +TEST_F(FcntlSignalTest, SetSigDefault) { + const auto signal_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGIO)); + RegisterFD(pipe_read_fd_, 0); // Zero = default behavior + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + signals_received_.pop_front(); + EXPECT_EQ(sig.num, SIGIO); + EXPECT_EQ(sig.info.si_signo, SIGIO); + // siginfo contents is undefined in this case. +} + +TEST_F(FcntlSignalTest, SetSigCustom) { + const auto signal_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + RegisterFD(pipe_read_fd_, SIGUSR1); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + signals_received_.pop_front(); + EXPECT_EQ(sig.num, SIGUSR1); + EXPECT_EQ(sig.info.si_signo, SIGUSR1); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigUnregisterStillGetsSigio) { + const auto sigio_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGIO)); + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + RegisterFD(pipe_read_fd_, SIGUSR1); + RegisterFD(pipe_read_fd_, 0); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + signals_received_.pop_front(); + EXPECT_EQ(sig.num, SIGIO); + // siginfo contents is undefined in this case. +} + +TEST_F(FcntlSignalTest, SetSigWithSigioStillGetsSiginfo) { + const auto signal_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGIO)); + RegisterFD(pipe_read_fd_, SIGIO); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + EXPECT_EQ(sig.num, SIGIO); + EXPECT_EQ(sig.info.si_signo, SIGIO); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigDupThenCloseOld) { + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + RegisterFD(pipe_read_fd_, SIGUSR1); + DupReadFD(); + FlushAndCloseFD(pipe_read_fd_); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with the **old** FD (even though it is closed). + EXPECT_EQ(sig.num, SIGUSR1); + EXPECT_EQ(sig.info.si_signo, SIGUSR1); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigDupThenCloseNew) { + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + RegisterFD(pipe_read_fd_, SIGUSR1); + DupReadFD(); + FlushAndCloseFD(pipe_read_fd_dup_); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with the old FD. + EXPECT_EQ(sig.num, SIGUSR1); + EXPECT_EQ(sig.info.si_signo, SIGUSR1); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigDupOldRegistered) { + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + RegisterFD(pipe_read_fd_, SIGUSR1); + DupReadFD(); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with the old FD. + EXPECT_EQ(sig.num, SIGUSR1); + EXPECT_EQ(sig.info.si_signo, SIGUSR1); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigDupNewRegistered) { + const auto sigusr2_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR2)); + DupReadFD(); + RegisterFD(pipe_read_fd_dup_, SIGUSR2); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with the new FD. + EXPECT_EQ(sig.num, SIGUSR2); + EXPECT_EQ(sig.info.si_signo, SIGUSR2); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_dup_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigDupBothRegistered) { + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + const auto sigusr2_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR2)); + RegisterFD(pipe_read_fd_, SIGUSR1); + DupReadFD(); + RegisterFD(pipe_read_fd_dup_, SIGUSR2); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with the **new** signal number, but the **old** FD. + EXPECT_EQ(sig.num, SIGUSR2); + EXPECT_EQ(sig.info.si_signo, SIGUSR2); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigDupBothRegisteredAfterDup) { + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + const auto sigusr2_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR2)); + DupReadFD(); + RegisterFD(pipe_read_fd_, SIGUSR1); + RegisterFD(pipe_read_fd_dup_, SIGUSR2); + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with the **new** signal number, but the **old** FD. + EXPECT_EQ(sig.num, SIGUSR2); + EXPECT_EQ(sig.info.si_signo, SIGUSR2); + EXPECT_EQ(sig.info.si_fd, pipe_read_fd_); + EXPECT_EQ(sig.info.si_band, EPOLLIN | EPOLLRDNORM); +} + +TEST_F(FcntlSignalTest, SetSigDupUnregisterOld) { + const auto sigio_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGIO)); + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + const auto sigusr2_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR2)); + RegisterFD(pipe_read_fd_, SIGUSR1); + DupReadFD(); + RegisterFD(pipe_read_fd_dup_, SIGUSR2); + RegisterFD(pipe_read_fd_, 0); // Should go back to SIGIO behavior. + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with SIGIO. + EXPECT_EQ(sig.num, SIGIO); + // siginfo is undefined in this case. +} + +TEST_F(FcntlSignalTest, SetSigDupUnregisterNew) { + const auto sigio_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGIO)); + const auto sigusr1_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR1)); + const auto sigusr2_cleanup = + ASSERT_NO_ERRNO_AND_VALUE(RegisterSignalHandler(SIGUSR2)); + RegisterFD(pipe_read_fd_, SIGUSR1); + DupReadFD(); + RegisterFD(pipe_read_fd_dup_, SIGUSR2); + RegisterFD(pipe_read_fd_dup_, 0); // Should go back to SIGIO behavior. + GenerateIOEvent(); + WaitForSignalDelivery(absl::Seconds(1)); + ASSERT_EQ(num_signals_received_, 1); + SignalDelivery sig = signals_received_.front(); + // We get a signal with SIGIO. + EXPECT_EQ(sig.num, SIGIO); + // siginfo is undefined in this case. +} + // Make sure that making multiple concurrent changes to async signal generation // does not cause any race issues. -TEST(FcntlTest, SetFlSetOwnDoNotRace) { +TEST(FcntlTest, SetFlSetOwnSetSigDoNotRace) { FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0)); @@ -1268,32 +1605,40 @@ TEST(FcntlTest, SetFlSetOwnDoNotRace) { EXPECT_THAT(pid = getpid(), SyscallSucceeds()); constexpr absl::Duration runtime = absl::Milliseconds(300); - auto setAsync = [&s, &runtime] { + auto set_async = [&s, &runtime] { for (auto start = absl::Now(); absl::Now() - start < runtime;) { ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETFL, O_ASYNC), SyscallSucceeds()); sched_yield(); } }; - auto resetAsync = [&s, &runtime] { + auto reset_async = [&s, &runtime] { for (auto start = absl::Now(); absl::Now() - start < runtime;) { ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETFL, 0), SyscallSucceeds()); sched_yield(); } }; - auto setOwn = [&s, &pid, &runtime] { + auto set_own = [&s, &pid, &runtime] { for (auto start = absl::Now(); absl::Now() - start < runtime;) { ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds()); sched_yield(); } }; + auto set_sig = [&s, &runtime] { + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETSIG, SIGUSR1), + SyscallSucceeds()); + sched_yield(); + } + }; std::list<ScopedThread> threads; for (int i = 0; i < 10; i++) { - threads.emplace_back(setAsync); - threads.emplace_back(resetAsync); - threads.emplace_back(setOwn); + threads.emplace_back(set_async); + threads.emplace_back(reset_async); + threads.emplace_back(set_own); + threads.emplace_back(set_sig); } } @@ -1302,57 +1647,60 @@ TEST(FcntlTest, SetFlSetOwnDoNotRace) { } // namespace testing } // namespace gvisor -int main(int argc, char** argv) { - gvisor::testing::TestInit(&argc, &argv); - - const std::string setlock_on = absl::GetFlag(FLAGS_child_setlock_on); - if (!setlock_on.empty()) { - int socket_fd = absl::GetFlag(FLAGS_socket_fd); - int fd = open(setlock_on.c_str(), O_RDWR, 0666); - if (fd == -1 && errno != 0) { - int err = errno; - std::cerr << "CHILD open " << setlock_on << " failed " << err - << std::endl; - exit(err); - } +int set_lock() { + const std::string set_lock_on = absl::GetFlag(FLAGS_child_set_lock_on); + int socket_fd = absl::GetFlag(FLAGS_socket_fd); + int fd = open(set_lock_on.c_str(), O_RDWR, 0666); + if (fd == -1 && errno != 0) { + int err = errno; + std::cerr << "CHILD open " << set_lock_on << " failed: " << err + << std::endl; + return err; + } - struct flock fl; - if (absl::GetFlag(FLAGS_child_setlock_write)) { - fl.l_type = F_WRLCK; - } else { - fl.l_type = F_RDLCK; - } - fl.l_whence = SEEK_SET; - fl.l_start = absl::GetFlag(FLAGS_child_setlock_start); - fl.l_len = absl::GetFlag(FLAGS_child_setlock_len); + struct flock fl; + if (absl::GetFlag(FLAGS_child_set_lock_write)) { + fl.l_type = F_WRLCK; + } else { + fl.l_type = F_RDLCK; + } + fl.l_whence = SEEK_SET; + fl.l_start = absl::GetFlag(FLAGS_child_set_lock_start); + fl.l_len = absl::GetFlag(FLAGS_child_set_lock_len); + + // Test the fcntl. + int err = 0; + int ret = 0; + + gvisor::testing::MonotonicTimer timer; + timer.Start(); + do { + ret = fcntl(fd, absl::GetFlag(FLAGS_blocking) ? F_SETLKW : F_SETLK, &fl); + } while (absl::GetFlag(FLAGS_retry_eintr) && ret == -1 && errno == EINTR); + auto usec = absl::ToInt64Microseconds(timer.Duration()); + + if (ret == -1 && errno != 0) { + err = errno; + std::cerr << "CHILD lock " << set_lock_on << " failed " << err << std::endl; + } - // Test the fcntl. - int err = 0; - int ret = 0; + // If there is a socket fd let's send back the time in microseconds it took + // to execute this syscall. + if (socket_fd != -1) { + gvisor::testing::WriteFd(socket_fd, reinterpret_cast<void*>(&usec), + sizeof(usec)); + close(socket_fd); + } - gvisor::testing::MonotonicTimer timer; - timer.Start(); - do { - ret = fcntl(fd, absl::GetFlag(FLAGS_blocking) ? F_SETLKW : F_SETLK, &fl); - } while (absl::GetFlag(FLAGS_retry_eintr) && ret == -1 && errno == EINTR); - auto usec = absl::ToInt64Microseconds(timer.Duration()); - - if (ret == -1 && errno != 0) { - err = errno; - std::cerr << "CHILD lock " << setlock_on << " failed " << err - << std::endl; - } + close(fd); + return err; +} - // If there is a socket fd let's send back the time in microseconds it took - // to execute this syscall. - if (socket_fd != -1) { - gvisor::testing::WriteFd(socket_fd, reinterpret_cast<void*>(&usec), - sizeof(usec)); - close(socket_fd); - } +int main(int argc, char** argv) { + gvisor::testing::TestInit(&argc, &argv); - close(fd); - exit(err); + if (!absl::GetFlag(FLAGS_child_set_lock_on).empty()) { + exit(set_lock()); } return gvisor::testing::RunAllTests(); diff --git a/test/syscalls/linux/kill.cc b/test/syscalls/linux/kill.cc index db29bd59c..5d1735853 100644 --- a/test/syscalls/linux/kill.cc +++ b/test/syscalls/linux/kill.cc @@ -58,6 +58,12 @@ void SigHandler(int sig, siginfo_t* info, void* context) { _exit(0); } // If pid equals -1, then sig is sent to every process for which the calling // process has permission to send signals, except for process 1 (init). TEST(KillTest, CanKillAllPIDs) { + // If we're not running inside the sandbox, then we skip this test + // as our namespace may contain may more processes that cannot tolerate + // the signal below. We also cannot reliably create a new pid namespace + // for ourselves and test the same functionality. + SKIP_IF(!IsRunningOnGvisor()); + int pipe_fds[2]; ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); FileDescriptor read_fd(pipe_fds[0]); diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc index 78c36f98f..9d63782fb 100644 --- a/test/syscalls/linux/open_create.cc +++ b/test/syscalls/linux/open_create.cc @@ -112,14 +112,6 @@ TEST(CreateTest, CreatFileWithOTruncAndReadOnly) { ASSERT_THAT(close(dirfd), SyscallSucceeds()); } -TEST(CreateTest, CreateFailsOnUnpermittedDir) { - // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to - // always override directory permissions. - ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); - ASSERT_THAT(open("/foo", O_CREAT | O_RDWR, 0644), - SyscallFailsWithErrno(EACCES)); -} - TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) { // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to // always override directory permissions. diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 7a0f33dff..575be014c 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -17,6 +17,7 @@ #include <fcntl.h> #include <limits.h> #include <linux/magic.h> +#include <linux/sem.h> #include <sched.h> #include <signal.h> #include <stddef.h> @@ -2409,6 +2410,28 @@ TEST(ProcFilesystems, PresenceOfShmMaxMniAll) { ASSERT_LE(shmall, ULONG_MAX - (1UL << 24)); } +TEST(ProcFilesystems, PresenceOfSem) { + uint32_t semmsl = 0; + uint32_t semmns = 0; + uint32_t semopm = 0; + uint32_t semmni = 0; + std::string proc_file; + proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/sem")); + ASSERT_FALSE(proc_file.empty()); + std::vector<absl::string_view> sem_limits = + absl::StrSplit(proc_file, absl::ByAnyChar("\t"), absl::SkipWhitespace()); + ASSERT_EQ(sem_limits.size(), 4); + ASSERT_TRUE(absl::SimpleAtoi(sem_limits[0], &semmsl)); + ASSERT_TRUE(absl::SimpleAtoi(sem_limits[1], &semmns)); + ASSERT_TRUE(absl::SimpleAtoi(sem_limits[2], &semopm)); + ASSERT_TRUE(absl::SimpleAtoi(sem_limits[3], &semmni)); + + ASSERT_EQ(semmsl, SEMMSL); + ASSERT_EQ(semmns, SEMMNS); + ASSERT_EQ(semopm, SEMOPM); + ASSERT_EQ(semmni, SEMMNI); +} + // Check that /proc/mounts is a symlink to self/mounts. TEST(ProcMounts, IsSymlink) { auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/mounts")); @@ -2459,7 +2482,7 @@ void CheckDuplicatesRecursively(std::string path) { return; } auto dir_closer = Cleanup([&dir]() { closedir(dir); }); - std::unordered_set<std::string> children; + absl::node_hash_set<std::string> children; while (true) { // Readdir(3): If the end of the directory stream is reached, NULL is // returned and errno is not changed. If an error occurs, NULL is @@ -2478,6 +2501,10 @@ void CheckDuplicatesRecursively(std::string path) { absl::EndsWith(path, "/net")) { break; } + // We may also see permission failures traversing some files. + if (errno == EACCES && absl::StartsWith(path, "/proc/")) { + break; + } // Otherwise, no errors are allowed. ASSERT_EQ(errno, 0) << path; diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc index 54709371c..955bcee4b 100644 --- a/test/syscalls/linux/raw_socket.cc +++ b/test/syscalls/linux/raw_socket.cc @@ -852,6 +852,51 @@ TEST(RawSocketTest, IPv6ProtoRaw) { SyscallFailsWithErrno(EINVAL)); } +TEST(RawSocketTest, IPv6SendMsg) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int sock; + ASSERT_THAT(sock = socket(AF_INET6, SOCK_RAW, IPPROTO_TCP), + SyscallSucceeds()); + + char kBuf[] = "hello"; + struct iovec iov = {}; + iov.iov_base = static_cast<void*>(const_cast<char*>(kBuf)); + iov.iov_len = static_cast<size_t>(sizeof(kBuf)); + + struct sockaddr_storage addr = {}; + struct sockaddr_in* sin = reinterpret_cast<struct sockaddr_in*>(&addr); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + struct msghdr msg = {}; + msg.msg_name = static_cast<void*>(&addr); + msg.msg_namelen = sizeof(sockaddr_in); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(RawSocketTest, ConnectOnIPv6Socket) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int sock; + ASSERT_THAT(sock = socket(AF_INET6, SOCK_RAW, IPPROTO_TCP), + SyscallSucceeds()); + + struct sockaddr_storage addr = {}; + struct sockaddr_in* sin = reinterpret_cast<struct sockaddr_in*>(&addr); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + ASSERT_THAT(connect(sock, reinterpret_cast<struct sockaddr*>(&addr), + sizeof(sockaddr_in6)), + SyscallFailsWithErrno(EAFNOSUPPORT)); +} + INSTANTIATE_TEST_SUITE_P( AllInetTests, RawSocketTest, ::testing::Combine(::testing::Values(IPPROTO_TCP, IPPROTO_UDP), diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc index 890f4a246..c2f080917 100644 --- a/test/syscalls/linux/semaphore.cc +++ b/test/syscalls/linux/semaphore.cc @@ -20,6 +20,7 @@ #include <atomic> #include <cerrno> #include <ctime> +#include <set> #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -31,10 +32,23 @@ #include "test/util/test_util.h" #include "test/util/thread_util.h" +using ::testing::Contains; + namespace gvisor { namespace testing { namespace { +constexpr int kSemMap = 1024000000; +constexpr int kSemMni = 32000; +constexpr int kSemMns = 1024000000; +constexpr int kSemMnu = 1024000000; +constexpr int kSemMsl = 32000; +constexpr int kSemOpm = 500; +constexpr int kSemUme = 500; +constexpr int kSemUsz = 20; +constexpr int kSemVmx = 32767; +constexpr int kSemAem = 32767; + class AutoSem { public: explicit AutoSem(int id) : id_(id) {} @@ -773,6 +787,154 @@ TEST(SemaphoreTest, SemopGetncntOnSignal_NoRandomSave) { EXPECT_EQ(semctl(sem.get(), 0, GETNCNT), 0); } +TEST(SemaphoreTest, IpcInfo) { + constexpr int kLoops = 5; + std::set<int> sem_ids; + struct seminfo info; + // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false)); + ASSERT_THAT(semctl(0, 0, IPC_INFO, &info), SyscallSucceedsWithValue(0)); + for (int i = 0; i < kLoops; i++) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + sem_ids.insert(sem.release()); + } + ASSERT_EQ(sem_ids.size(), kLoops); + + int max_used_index = 0; + EXPECT_THAT(max_used_index = semctl(0, 0, IPC_INFO, &info), + SyscallSucceeds()); + + int index_count = 0; + for (int i = 0; i <= max_used_index; i++) { + struct semid_ds ds = {}; + int sem_id = semctl(i, 0, SEM_STAT, &ds); + // Only if index i is used within the registry. + if (sem_id != -1) { + ASSERT_THAT(sem_ids, Contains(sem_id)); + struct semid_ds ipc_stat_ds; + ASSERT_THAT(semctl(sem_id, 0, IPC_STAT, &ipc_stat_ds), SyscallSucceeds()); + EXPECT_EQ(ds.sem_perm.__key, ipc_stat_ds.sem_perm.__key); + EXPECT_EQ(ds.sem_perm.uid, ipc_stat_ds.sem_perm.uid); + EXPECT_EQ(ds.sem_perm.gid, ipc_stat_ds.sem_perm.gid); + EXPECT_EQ(ds.sem_perm.cuid, ipc_stat_ds.sem_perm.cuid); + EXPECT_EQ(ds.sem_perm.cgid, ipc_stat_ds.sem_perm.cgid); + EXPECT_EQ(ds.sem_perm.mode, ipc_stat_ds.sem_perm.mode); + EXPECT_EQ(ds.sem_otime, ipc_stat_ds.sem_otime); + EXPECT_EQ(ds.sem_ctime, ipc_stat_ds.sem_ctime); + EXPECT_EQ(ds.sem_nsems, ipc_stat_ds.sem_nsems); + + // Remove the semaphore set's read permission. + struct semid_ds ipc_set_ds; + ipc_set_ds.sem_perm.uid = getuid(); + ipc_set_ds.sem_perm.gid = getgid(); + // Keep the semaphore set's write permission so that it could be removed. + ipc_set_ds.sem_perm.mode = 0200; + ASSERT_THAT(semctl(sem_id, 0, IPC_SET, &ipc_set_ds), SyscallSucceeds()); + ASSERT_THAT(semctl(i, 0, SEM_STAT, &ds), SyscallFailsWithErrno(EACCES)); + + index_count += 1; + } + } + EXPECT_EQ(index_count, kLoops); + ASSERT_THAT(semctl(0, 0, IPC_INFO, &info), + SyscallSucceedsWithValue(max_used_index)); + for (const int sem_id : sem_ids) { + ASSERT_THAT(semctl(sem_id, 0, IPC_RMID), SyscallSucceeds()); + } + + ASSERT_THAT(semctl(0, 0, IPC_INFO, &info), SyscallSucceedsWithValue(0)); + EXPECT_EQ(info.semmap, kSemMap); + EXPECT_EQ(info.semmni, kSemMni); + EXPECT_EQ(info.semmns, kSemMns); + EXPECT_EQ(info.semmnu, kSemMnu); + EXPECT_EQ(info.semmsl, kSemMsl); + EXPECT_EQ(info.semopm, kSemOpm); + EXPECT_EQ(info.semume, kSemUme); + EXPECT_EQ(info.semusz, kSemUsz); + EXPECT_EQ(info.semvmx, kSemVmx); + EXPECT_EQ(info.semaem, kSemAem); +} + +TEST(SemaphoreTest, SemInfo) { + constexpr int kLoops = 5; + constexpr int kSemSetSize = 3; + std::set<int> sem_ids; + struct seminfo info; + // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions. + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false)); + ASSERT_THAT(semctl(0, 0, IPC_INFO, &info), SyscallSucceedsWithValue(0)); + for (int i = 0; i < kLoops; i++) { + AutoSem sem(semget(IPC_PRIVATE, kSemSetSize, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + sem_ids.insert(sem.release()); + } + ASSERT_EQ(sem_ids.size(), kLoops); + int max_used_index = 0; + EXPECT_THAT(max_used_index = semctl(0, 0, SEM_INFO, &info), + SyscallSucceeds()); + EXPECT_EQ(info.semmap, kSemMap); + EXPECT_EQ(info.semmni, kSemMni); + EXPECT_EQ(info.semmns, kSemMns); + EXPECT_EQ(info.semmnu, kSemMnu); + EXPECT_EQ(info.semmsl, kSemMsl); + EXPECT_EQ(info.semopm, kSemOpm); + EXPECT_EQ(info.semume, kSemUme); + EXPECT_EQ(info.semusz, sem_ids.size()); + EXPECT_EQ(info.semvmx, kSemVmx); + EXPECT_EQ(info.semaem, sem_ids.size() * kSemSetSize); + + int index_count = 0; + for (int i = 0; i <= max_used_index; i++) { + struct semid_ds ds = {}; + int sem_id = semctl(i, 0, SEM_STAT, &ds); + // Only if index i is used within the registry. + if (sem_id != -1) { + ASSERT_THAT(sem_ids, Contains(sem_id)); + struct semid_ds ipc_stat_ds; + ASSERT_THAT(semctl(sem_id, 0, IPC_STAT, &ipc_stat_ds), SyscallSucceeds()); + EXPECT_EQ(ds.sem_perm.__key, ipc_stat_ds.sem_perm.__key); + EXPECT_EQ(ds.sem_perm.uid, ipc_stat_ds.sem_perm.uid); + EXPECT_EQ(ds.sem_perm.gid, ipc_stat_ds.sem_perm.gid); + EXPECT_EQ(ds.sem_perm.cuid, ipc_stat_ds.sem_perm.cuid); + EXPECT_EQ(ds.sem_perm.cgid, ipc_stat_ds.sem_perm.cgid); + EXPECT_EQ(ds.sem_perm.mode, ipc_stat_ds.sem_perm.mode); + EXPECT_EQ(ds.sem_otime, ipc_stat_ds.sem_otime); + EXPECT_EQ(ds.sem_ctime, ipc_stat_ds.sem_ctime); + EXPECT_EQ(ds.sem_nsems, ipc_stat_ds.sem_nsems); + + // Remove the semaphore set's read permission. + struct semid_ds ipc_set_ds; + ipc_set_ds.sem_perm.uid = getuid(); + ipc_set_ds.sem_perm.gid = getgid(); + // Keep the semaphore set's write permission so that it could be removed. + ipc_set_ds.sem_perm.mode = 0200; + ASSERT_THAT(semctl(sem_id, 0, IPC_SET, &ipc_set_ds), SyscallSucceeds()); + ASSERT_THAT(semctl(i, 0, SEM_STAT, &ds), SyscallFailsWithErrno(EACCES)); + + index_count += 1; + } + } + EXPECT_EQ(index_count, kLoops); + ASSERT_THAT(semctl(0, 0, SEM_INFO, &info), + SyscallSucceedsWithValue(max_used_index)); + for (const int sem_id : sem_ids) { + ASSERT_THAT(semctl(sem_id, 0, IPC_RMID), SyscallSucceeds()); + } + + ASSERT_THAT(semctl(0, 0, SEM_INFO, &info), SyscallSucceedsWithValue(0)); + EXPECT_EQ(info.semmap, kSemMap); + EXPECT_EQ(info.semmni, kSemMni); + EXPECT_EQ(info.semmns, kSemMns); + EXPECT_EQ(info.semmnu, kSemMnu); + EXPECT_EQ(info.semmsl, kSemMsl); + EXPECT_EQ(info.semopm, kSemOpm); + EXPECT_EQ(info.semume, kSemUme); + EXPECT_EQ(info.semusz, 0); + EXPECT_EQ(info.semvmx, kSemVmx); + EXPECT_EQ(info.semaem, 0); +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc index 389e5fca2..c86cd2755 100644 --- a/test/syscalls/linux/signalfd.cc +++ b/test/syscalls/linux/signalfd.cc @@ -126,7 +126,7 @@ TEST_P(SignalfdTest, Blocking) { // Shared tid variable. absl::Mutex mu; - bool has_tid; + bool has_tid = false; pid_t tid; // Start a thread reading. diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index 796546224..a28ee2233 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -818,32 +818,55 @@ TEST_P(AllSocketPairTest, GetSockoptProtocol) { } } -TEST_P(AllSocketPairTest, GetSockoptBroadcast) { - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); - int opt = -1; - socklen_t optlen = sizeof(opt); - EXPECT_THAT( - getsockopt(sockets->first_fd(), SOL_SOCKET, SO_BROADCAST, &opt, &optlen), - SyscallSucceeds()); - ASSERT_EQ(optlen, sizeof(opt)); - EXPECT_EQ(opt, 0); +TEST_P(AllSocketPairTest, SetAndGetBooleanSocketOptions) { + int sock_opts[] = {SO_BROADCAST, SO_PASSCRED, SO_NO_CHECK, + SO_REUSEADDR, SO_REUSEPORT, SO_KEEPALIVE}; + for (int sock_opt : sock_opts) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + int enable = -1; + socklen_t enableLen = sizeof(enable); + + // Test that the option is initially set to false. + ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, sock_opt, &enable, + &enableLen), + SyscallSucceeds()); + ASSERT_EQ(enableLen, sizeof(enable)); + EXPECT_EQ(enable, 0) << absl::StrFormat( + "getsockopt(fd, SOL_SOCKET, %d, &enable, &enableLen) => enable=%d", + sock_opt, enable); + + // Test that setting the option to true is reflected in the subsequent + // call to getsockopt(2). + enable = 1; + ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, sock_opt, &enable, + sizeof(enable)), + SyscallSucceeds()); + enable = -1; + enableLen = sizeof(enable); + ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, sock_opt, &enable, + &enableLen), + SyscallSucceeds()); + ASSERT_EQ(enableLen, sizeof(enable)); + EXPECT_EQ(enable, 1) << absl::StrFormat( + "getsockopt(fd, SOL_SOCKET, %d, &enable, &enableLen) => enable=%d", + sock_opt, enable); + } } -TEST_P(AllSocketPairTest, SetAndGetSocketBroadcastOption) { - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); - int kSockOptOn = 1; - ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_BROADCAST, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceedsWithValue(0)); +TEST_P(AllSocketPairTest, GetSocketOutOfBandInlineOption) { + // We do not support disabling this option. It is always enabled. + SKIP_IF(!IsRunningOnGvisor()); - int got = -1; - socklen_t length = sizeof(got); - ASSERT_THAT( - getsockopt(sockets->first_fd(), SOL_SOCKET, SO_BROADCAST, &got, &length), - SyscallSucceedsWithValue(0)); + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + int enable = -1; + socklen_t enableLen = sizeof(enable); - ASSERT_EQ(length, sizeof(got)); - EXPECT_EQ(got, kSockOptOn); + int want = 1; + ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_OOBINLINE, &enable, + &enableLen), + SyscallSucceeds()); + ASSERT_EQ(enableLen, sizeof(enable)); + EXPECT_EQ(enable, want); } } // namespace testing diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index e19a83413..51b77ad85 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -1185,19 +1185,44 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) { listen_fd.get(), reinterpret_cast<sockaddr*>(&accept_addr), &addrlen)); ASSERT_EQ(addrlen, listener.addr_len); - // TODO(gvisor.dev/issue/3812): Remove after SO_ERROR is fixed. - if (IsRunningOnGvisor()) { - char buf[10]; - ASSERT_THAT(ReadFd(accept_fd.get(), buf, sizeof(buf)), - SyscallFailsWithErrno(ECONNRESET)); - } else { + // Wait for accept_fd to process the RST. + const int kTimeout = 10000; + struct pollfd pfd = { + .fd = accept_fd.get(), + .events = POLLIN, + }; + ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR); + + { int err; socklen_t optlen = sizeof(err); ASSERT_THAT( getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), SyscallSucceeds()); + // This should return ECONNRESET as the socket just received a RST packet + // from the peer. + ASSERT_EQ(optlen, sizeof(err)); ASSERT_EQ(err, ECONNRESET); + } + { + int err; + socklen_t optlen = sizeof(err); + ASSERT_THAT( + getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), + SyscallSucceeds()); + // This should return no error as the previous getsockopt call would have + // cleared the socket error. ASSERT_EQ(optlen, sizeof(err)); + ASSERT_EQ(err, 0); + } + { + sockaddr_storage peer_addr; + socklen_t addrlen = sizeof(peer_addr); + // The socket is not connected anymore and should return ENOTCONN. + ASSERT_THAT(getpeername(accept_fd.get(), + reinterpret_cast<sockaddr*>(&peer_addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); } } @@ -2805,5 +2830,28 @@ INSTANTIATE_TEST_SUITE_P( } // namespace +// Check that loopback receives connections from any address in the range: +// 127.0.0.1 to 127.254.255.255. This behavior is exclusive to IPv4. +TEST_F(SocketInetLoopbackTest, LoopbackAddressRangeConnect) { + TestAddress const& listener = V4Any(); + + in_addr_t addresses[] = { + INADDR_LOOPBACK, + INADDR_LOOPBACK + 1, // 127.0.0.2 + (in_addr_t)0x7f000101, // 127.0.1.1 + (in_addr_t)0x7f010101, // 127.1.1.1 + (in_addr_t)0x7ffeffff, // 127.254.255.255 + }; + for (const auto& address : addresses) { + TestAddress connector("V4Loopback"); + connector.addr.ss_family = AF_INET; + connector.addr_len = sizeof(sockaddr_in); + reinterpret_cast<sockaddr_in*>(&connector.addr)->sin_addr.s_addr = + htonl(address); + + tcpSimpleConnectTest(listener, connector, true); + } +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index f69f8f99f..2fcd08112 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -15,6 +15,9 @@ #include "test/syscalls/linux/socket_ip_udp_generic.h" #include <errno.h> +#ifdef __linux__ +#include <linux/in6.h> +#endif // __linux__ #include <netinet/in.h> #include <netinet/tcp.h> #include <poll.h> @@ -356,6 +359,58 @@ TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) { EXPECT_EQ(get_len, sizeof(get)); } +// Test getsockopt for a socket which is not set with IP_RECVORIGDSTADDR option. +TEST_P(UDPSocketPairTest, ReceiveOrigDstAddrDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + int level = SOL_IP; + int type = IP_RECVORIGDSTADDR; + if (sockets->first_addr()->sa_family == AF_INET6) { + level = SOL_IPV6; + type = IPV6_RECVORIGDSTADDR; + } + ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); +} + +// Test setsockopt and getsockopt for a socket with IP_RECVORIGDSTADDR option. +TEST_P(UDPSocketPairTest, SetAndGetReceiveOrigDstAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int level = SOL_IP; + int type = IP_RECVORIGDSTADDR; + if (sockets->first_addr()->sa_family == AF_INET6) { + level = SOL_IPV6; + type = IPV6_RECVORIGDSTADDR; + } + + // Check getsockopt before IP_PKTINFO is set. + int get = -1; + socklen_t get_len = sizeof(get); + + ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + + ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get, kSockOptOn); + EXPECT_EQ(get_len, sizeof(get)); + + ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOff, + sizeof(kSockOptOff)), + SyscallSucceedsWithValue(0)); + + ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get, kSockOptOff); + EXPECT_EQ(get_len, sizeof(get)); +} + // Holds TOS or TClass information for IPv4 or IPv6 respectively. struct RecvTosOption { int level; diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index b3f54e7f6..e557572a7 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -2222,6 +2222,90 @@ TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) { EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK)); } +// Test that socket will receive IP_RECVORIGDSTADDR control message. +TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPReceiveOrigDstAddr) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver_addr = V4Loopback(); + int level = SOL_IP; + int type = IP_RECVORIGDSTADDR; + + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + + // Retrieve the port bound by the receiver. + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast<sockaddr*>(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + ASSERT_THAT( + connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + + // Get address and port bound by the sender. + sockaddr_storage sender_addr_storage; + socklen_t sender_addr_len = sizeof(sender_addr_storage); + ASSERT_THAT(getsockname(sender->get(), + reinterpret_cast<sockaddr*>(&sender_addr_storage), + &sender_addr_len), + SyscallSucceeds()); + ASSERT_EQ(sender_addr_len, sizeof(struct sockaddr_in)); + + // Enable IP_RECVORIGDSTADDR on socket so that we get the original destination + // address of the datagram as auxiliary information in the control message. + ASSERT_THAT( + setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + // Prepare message to send. + constexpr size_t kDataLength = 1024; + msghdr sent_msg = {}; + iovec sent_iov = {}; + char sent_data[kDataLength]; + sent_iov.iov_base = sent_data; + sent_iov.iov_len = kDataLength; + sent_msg.msg_iov = &sent_iov; + sent_msg.msg_iovlen = 1; + sent_msg.msg_flags = 0; + + ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0), + SyscallSucceedsWithValue(kDataLength)); + + msghdr received_msg = {}; + iovec received_iov = {}; + char received_data[kDataLength]; + char received_cmsg_buf[CMSG_SPACE(sizeof(sockaddr_in))] = {}; + size_t cmsg_data_len = sizeof(sockaddr_in); + received_iov.iov_base = received_data; + received_iov.iov_len = kDataLength; + received_msg.msg_iov = &received_iov; + received_msg.msg_iovlen = 1; + received_msg.msg_controllen = CMSG_LEN(cmsg_data_len); + received_msg.msg_control = received_cmsg_buf; + + ASSERT_THAT(RecvMsgTimeout(receiver->get(), &received_msg, 1 /*timeout*/), + IsPosixErrorOkAndHolds(kDataLength)); + + cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg); + ASSERT_NE(cmsg, nullptr); + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len)); + EXPECT_EQ(cmsg->cmsg_level, level); + EXPECT_EQ(cmsg->cmsg_type, type); + + // Check the data + sockaddr_in received_addr = {}; + memcpy(&received_addr, CMSG_DATA(cmsg), sizeof(received_addr)); + auto orig_receiver_addr = reinterpret_cast<sockaddr_in*>(&receiver_addr.addr); + EXPECT_EQ(received_addr.sin_addr.s_addr, orig_receiver_addr->sin_addr.s_addr); + EXPECT_EQ(received_addr.sin_port, orig_receiver_addr->sin_port); +} + // Check that setting SO_RCVBUF below min is clamped to the minimum // receive buffer size. TEST_P(IPv4UDPUnboundSocketTest, SetSocketRecvBufBelowMin) { diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound.cc b/test/syscalls/linux/socket_ipv6_udp_unbound.cc new file mode 100644 index 000000000..08526468e --- /dev/null +++ b/test/syscalls/linux/socket_ipv6_udp_unbound.cc @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_ipv6_udp_unbound.h" + +#include <arpa/inet.h> +#include <netinet/in.h> +#ifdef __linux__ +#include <linux/in6.h> +#endif // __linux__ +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> + +#include <cstdio> +#include <cstring> + +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Test that socket will receive IP_RECVORIGDSTADDR control message. +TEST_P(IPv6UDPUnboundSocketTest, SetAndReceiveIPReceiveOrigDstAddr) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver_addr = V6Loopback(); + int level = SOL_IPV6; + int type = IPV6_RECVORIGDSTADDR; + + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + + // Retrieve the port bound by the receiver. + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast<sockaddr*>(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + ASSERT_THAT( + connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + + // Get address and port bound by the sender. + sockaddr_storage sender_addr_storage; + socklen_t sender_addr_len = sizeof(sender_addr_storage); + ASSERT_THAT(getsockname(sender->get(), + reinterpret_cast<sockaddr*>(&sender_addr_storage), + &sender_addr_len), + SyscallSucceeds()); + ASSERT_EQ(sender_addr_len, sizeof(struct sockaddr_in6)); + + // Enable IP_RECVORIGDSTADDR on socket so that we get the original destination + // address of the datagram as auxiliary information in the control message. + ASSERT_THAT( + setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + // Prepare message to send. + constexpr size_t kDataLength = 1024; + msghdr sent_msg = {}; + iovec sent_iov = {}; + char sent_data[kDataLength]; + sent_iov.iov_base = sent_data; + sent_iov.iov_len = kDataLength; + sent_msg.msg_iov = &sent_iov; + sent_msg.msg_iovlen = 1; + sent_msg.msg_flags = 0; + + ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0), + SyscallSucceedsWithValue(kDataLength)); + + msghdr received_msg = {}; + iovec received_iov = {}; + char received_data[kDataLength]; + char received_cmsg_buf[CMSG_SPACE(sizeof(sockaddr_in6))] = {}; + size_t cmsg_data_len = sizeof(sockaddr_in6); + received_iov.iov_base = received_data; + received_iov.iov_len = kDataLength; + received_msg.msg_iov = &received_iov; + received_msg.msg_iovlen = 1; + received_msg.msg_controllen = CMSG_LEN(cmsg_data_len); + received_msg.msg_control = received_cmsg_buf; + + ASSERT_THAT(RecvMsgTimeout(receiver->get(), &received_msg, 1 /*timeout*/), + IsPosixErrorOkAndHolds(kDataLength)); + + cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg); + ASSERT_NE(cmsg, nullptr); + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len)); + EXPECT_EQ(cmsg->cmsg_level, level); + EXPECT_EQ(cmsg->cmsg_type, type); + + // Check that the received address in the control message matches the expected + // receiver's address. + sockaddr_in6 received_addr = {}; + memcpy(&received_addr, CMSG_DATA(cmsg), sizeof(received_addr)); + auto orig_receiver_addr = + reinterpret_cast<sockaddr_in6*>(&receiver_addr.addr); + EXPECT_EQ(memcmp(&received_addr.sin6_addr, &orig_receiver_addr->sin6_addr, + sizeof(in6_addr)), + 0); + EXPECT_EQ(received_addr.sin6_port, orig_receiver_addr->sin6_port); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound.h b/test/syscalls/linux/socket_ipv6_udp_unbound.h new file mode 100644 index 000000000..71e160f73 --- /dev/null +++ b/test/syscalls/linux/socket_ipv6_udp_unbound.h @@ -0,0 +1,29 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to IPv6 UDP sockets. +using IPv6UDPUnboundSocketTest = SimpleSocketTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_H_ diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback.cc new file mode 100644 index 000000000..058336ecc --- /dev/null +++ b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback.cc @@ -0,0 +1,32 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <vector> + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_ipv6_udp_unbound.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +INSTANTIATE_TEST_SUITE_P( + IPv6UDPSockets, IPv6UDPUnboundSocketTest, + ::testing::ValuesIn(ApplyVec<SocketKind>(IPv6UDPUnboundSocket, + AllBitwiseCombinations(List<int>{ + 0, SOCK_NONBLOCK})))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index bc2c8278c..714848b8e 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -964,37 +964,156 @@ TEST_P(TcpSocketTest, PollAfterShutdown) { SyscallSucceedsWithValue(1)); } -TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) { +TEST_P(SimpleTcpSocketTest, NonBlockingConnectRetry) { + const FileDescriptor listener = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + // Initialize address to the loopback one. sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); socklen_t addrlen = sizeof(addr); - const FileDescriptor s = + // Bind to some port but don't listen yet. + ASSERT_THAT( + bind(listener.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + // Get the address we're bound to, then connect to it. We need to do this + // because we're allowing the stack to pick a port for us. + ASSERT_THAT(getsockname(listener.get(), + reinterpret_cast<struct sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + FileDescriptor connector = ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); - // Set the FD to O_NONBLOCK. - int opts; - ASSERT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds()); - opts |= O_NONBLOCK; - ASSERT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds()); + // Verify that connect fails. + ASSERT_THAT( + RetryEINTR(connect)(connector.get(), + reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNREFUSED)); - ASSERT_THAT(RetryEINTR(connect)( + // Now start listening + ASSERT_THAT(listen(listener.get(), SOMAXCONN), SyscallSucceeds()); + + // TODO(gvisor.dev/issue/3828): Issuing connect() again on a socket that + // failed first connect should succeed. + if (IsRunningOnGvisor()) { + ASSERT_THAT( + RetryEINTR(connect)(connector.get(), + reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); + return; + } + + // Verify that connect now succeeds. + ASSERT_THAT( + RetryEINTR(connect)(connector.get(), + reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + // Accept the connection. + const FileDescriptor accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listener.get(), nullptr, nullptr)); +} + +// nonBlockingConnectNoListener returns a socket on which a connect that is +// expected to fail has been issued. +PosixErrorOr<FileDescriptor> nonBlockingConnectNoListener(const int family, + sockaddr_storage addr, + socklen_t addrlen) { + // We will first create a socket and bind to ensure we bind a port but will + // not call listen on this socket. + // Then we will create a new socket that will connect to the port bound by + // the first socket and that shoud fail. + constexpr int sock_type = SOCK_STREAM | SOCK_NONBLOCK; + int b_sock; + RETURN_ERROR_IF_SYSCALL_FAIL(b_sock = socket(family, sock_type, IPPROTO_TCP)); + FileDescriptor b(b_sock); + EXPECT_THAT(bind(b.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + // Get the address bound by the listening socket. + EXPECT_THAT( + getsockname(b.get(), reinterpret_cast<struct sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + // Now create another socket and issue a connect on this one. This connect + // should fail as there is no listener. + int c_sock; + RETURN_ERROR_IF_SYSCALL_FAIL(c_sock = socket(family, sock_type, IPPROTO_TCP)); + FileDescriptor s(c_sock); + + // Now connect to the bound address and this should fail as nothing + // is listening on the bound address. + EXPECT_THAT(RetryEINTR(connect)( s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), SyscallFailsWithErrno(EINPROGRESS)); - // Now polling on the FD with a timeout should return 0 corresponding to no - // FDs ready. - struct pollfd poll_fd = {s.get(), POLLOUT, 0}; - EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000), - SyscallSucceedsWithValue(1)); + // Wait for the connect to fail. + struct pollfd poll_fd = {s.get(), POLLERR, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 1000), SyscallSucceedsWithValue(1)); + return std::move(s); +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) { + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie(); int err; socklen_t optlen = sizeof(err); ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), SyscallSucceeds()); - + ASSERT_THAT(optlen, sizeof(err)); EXPECT_EQ(err, ECONNREFUSED); + + unsigned char c; + ASSERT_THAT(read(s.get(), &c, sizeof(c)), SyscallSucceedsWithValue(0)); + int opts; + EXPECT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + opts &= ~O_NONBLOCK; + EXPECT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds()); + // Try connecting again. + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListenerRead) { + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie(); + + unsigned char c; + ASSERT_THAT(read(s.get(), &c, 1), SyscallFailsWithErrno(ECONNREFUSED)); + ASSERT_THAT(read(s.get(), &c, 1), SyscallSucceedsWithValue(0)); + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListenerPeek) { + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie(); + + unsigned char c; + ASSERT_THAT(recv(s.get(), &c, 1, MSG_PEEK), + SyscallFailsWithErrno(ECONNREFUSED)); + ASSERT_THAT(recv(s.get(), &c, 1, MSG_PEEK), SyscallSucceedsWithValue(0)); + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); } TEST_P(SimpleTcpSocketTest, SelfConnectSendRecv_NoRandomSave) { @@ -1235,6 +1354,19 @@ TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) { // Attempt #2, with the new socket and reused addr our connect should fail in // the same way as before, not with an EADDRINUSE. + // + // TODO(gvisor.dev/issue/3828): 2nd connect on a socket which failed connect + // first time should succeed. + // gVisor never issues the second connect and returns ECONNABORTED instead. + // Linux actually sends a SYN again and gets a RST and correctly returns + // ECONNREFUSED. + if (IsRunningOnGvisor()) { + ASSERT_THAT(connect(client_s.get(), + reinterpret_cast<const struct sockaddr*>(&bound_addr), + bound_addrlen), + SyscallFailsWithErrno(ECONNABORTED)); + return; + } ASSERT_THAT(connect(client_s.get(), reinterpret_cast<const struct sockaddr*>(&bound_addr), bound_addrlen), diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index d65275fd3..90ef8bf21 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -374,6 +374,69 @@ TEST_P(UdpSocketTest, BindInUse) { SyscallFailsWithErrno(EADDRINUSE)); } +TEST_P(UdpSocketTest, ConnectWriteToInvalidPort) { + // Discover a free unused port by creating a new UDP socket, binding it + // recording the just bound port and closing it. This is not guaranteed as it + // can still race with other port UDP sockets trying to bind a port at the + // same time. + struct sockaddr_storage addr_storage = InetLoopbackAddr(); + socklen_t addrlen = sizeof(addr_storage); + struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage); + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP)); + ASSERT_THAT(bind(s.get(), addr, addrlen), SyscallSucceeds()); + ASSERT_THAT(getsockname(s.get(), addr, &addrlen), SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_NE(*Port(&addr_storage), 0); + ASSERT_THAT(close(s.release()), SyscallSucceeds()); + + // Now connect to the port that we just released. This should generate an + // ECONNREFUSED error. + ASSERT_THAT(connect(sock_.get(), addr, addrlen_), SyscallSucceeds()); + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + // Send from sock_ to an unbound port. + ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, addr, addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Now verify that we got an ICMP error back of ECONNREFUSED. + int err; + socklen_t optlen = sizeof(err); + ASSERT_THAT(getsockopt(sock_.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), + SyscallSucceeds()); + ASSERT_EQ(err, ECONNREFUSED); + ASSERT_EQ(optlen, sizeof(err)); +} + +TEST_P(UdpSocketTest, ConnectSimultaneousWriteToInvalidPort) { + // Discover a free unused port by creating a new UDP socket, binding it + // recording the just bound port and closing it. This is not guaranteed as it + // can still race with other port UDP sockets trying to bind a port at the + // same time. + struct sockaddr_storage addr_storage = InetLoopbackAddr(); + socklen_t addrlen = sizeof(addr_storage); + struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage); + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP)); + ASSERT_THAT(bind(s.get(), addr, addrlen), SyscallSucceeds()); + ASSERT_THAT(getsockname(s.get(), addr, &addrlen), SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_NE(*Port(&addr_storage), 0); + ASSERT_THAT(close(s.release()), SyscallSucceeds()); + + // Now connect to the port that we just released. + ScopedThread t([&] { + ASSERT_THAT(connect(sock_.get(), addr, addrlen_), SyscallSucceeds()); + }); + + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + // Send from sock_ to an unbound port. + ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, addr, addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + t.Join(); +} + TEST_P(UdpSocketTest, ReceiveAfterConnect) { ASSERT_NO_ERRNO(BindLoopback()); ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds()); diff --git a/tools/bazel.mk b/tools/bazel.mk index 3a7de427f..396785e16 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -14,49 +14,81 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Make hacks. -EMPTY := -SPACE := $(EMPTY) $(EMPTY) +## +## Docker options. +## +## This file supports targets that wrap bazel in a running Docker +## container to simplify development. Some options are available to +## control the behavior of this container: +## +## USER - The in-container user. +## DOCKER_RUN_OPTIONS - Options for the container (default: --privileged, required for tests). +## DOCKER_NAME - The container name (default: gvisor-bazel-HASH). +## DOCKER_PRIVILEGED - Docker privileged flags (default: --privileged). +## BAZEL_CACHE - The bazel cache directory (default: detected). +## GCLOUD_CONFIG - The gcloud config directory (detect: detected). +## DOCKER_SOCKET - The Docker socket (default: detected). +## +## To opt out of these wrappers, set DOCKER_BUILD=false. +DOCKER_BUILD := true +ifeq ($(DOCKER_BUILD),true) +-include bazel-server +endif # See base Makefile. -SHELL=/bin/bash -o pipefail BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \ - git rev-parse --abbrev-ref HEAD 2>/dev/null) | \ - xargs -n 1 basename 2>/dev/null) + git rev-parse --abbrev-ref HEAD 2>/dev/null) | \ + xargs -n 1 basename 2>/dev/null) BUILD_ROOTS := bazel-bin/ bazel-out/ # Bazel container configuration (see below). USER := $(shell whoami) HASH := $(shell readlink -m $(CURDIR) | md5sum | cut -c1-8) -BUILDER_BASE := gvisor.dev/images/default -BUILDER_IMAGE := gvisor.dev/images/builder -BUILDER_NAME := gvisor-builder-$(HASH) -DOCKER_NAME := gvisor-bazel-$(HASH) +BUILDER_NAME := gvisor-builder-$(HASH)-$(ARCH) +DOCKER_NAME := gvisor-bazel-$(HASH)-$(ARCH) DOCKER_PRIVILEGED := --privileged BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/) GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/) DOCKER_SOCKET := /var/run/docker.sock -DOCKER_CONFIG := /etc/docker/daemon.json +DOCKER_CONFIG := /etc/docker -# Bazel flags. -BAZEL := bazel $(STARTUP_OPTIONS) -OPTIONS += --color=no --curses=no +## +## Bazel helpers. +## +## Bazel will be run with standard flags. You can specify the following flags +## to control which flags are passed: +## +## STARTUP_OPTIONS - Startup options passed to Bazel. +## BAZEL_CONFIG - A bazel config file. +## +STARTUP_OPTIONS := +BAZEL_CONFIG := +BAZEL := bazel $(STARTUP_OPTIONS) +BASE_OPTIONS := --color=no --curses=no +ifneq (,$(BAZEL_CONFIG)) +BASE_OPTIONS += --config=$(BAZEL_CONFIG) +endif +TEST_OPTIONS := $(BASE_OPTIONS) \ + --test_output=errors \ + --keep_going \ + --verbose_failures=true \ + --build_event_json_file=.build_events.json # Basic options. UID := $(shell id -u ${USER}) GID := $(shell id -g ${USER}) USERADD_OPTIONS := -FULL_DOCKER_RUN_OPTIONS := $(DOCKER_RUN_OPTIONS) -FULL_DOCKER_RUN_OPTIONS += --user $(UID):$(GID) -FULL_DOCKER_RUN_OPTIONS += --entrypoint "" -FULL_DOCKER_RUN_OPTIONS += --init -FULL_DOCKER_RUN_OPTIONS += -v "$(BAZEL_CACHE):$(BAZEL_CACHE)" -FULL_DOCKER_RUN_OPTIONS += -v "$(GCLOUD_CONFIG):$(GCLOUD_CONFIG)" -FULL_DOCKER_RUN_OPTIONS += -v "/tmp:/tmp" -FULL_DOCKER_EXEC_OPTIONS := --user $(UID):$(GID) -FULL_DOCKER_EXEC_OPTIONS += --interactive -ifeq (true,$(shell [[ -t 0 ]] && echo true)) -FULL_DOCKER_EXEC_OPTIONS += --tty +DOCKER_RUN_OPTIONS := +DOCKER_RUN_OPTIONS += --user $(UID):$(GID) +DOCKER_RUN_OPTIONS += --entrypoint "" +DOCKER_RUN_OPTIONS += --init +DOCKER_RUN_OPTIONS += -v "$(BAZEL_CACHE):$(BAZEL_CACHE)" +DOCKER_RUN_OPTIONS += -v "$(GCLOUD_CONFIG):$(GCLOUD_CONFIG)" +DOCKER_RUN_OPTIONS += -v "/tmp:/tmp" +DOCKER_EXEC_OPTIONS := --user $(UID):$(GID) +DOCKER_EXEC_OPTIONS += --interactive +ifeq (true,$(shell test -t 0 && echo true)) +DOCKER_EXEC_OPTIONS += --tty endif # Add basic UID/GID options. @@ -72,7 +104,7 @@ endif # out of disk space. ifneq ($(UID),0) USERADD_DOCKER += useradd -l --uid $(UID) --non-unique --no-create-home \ - --gid $(GID) $(USERADD_OPTIONS) -d $(HOME) $(USER) && + --gid $(GID) $(USERADD_OPTIONS) -d $(HOME) $(USER) && endif ifneq ($(GID),0) GROUPADD_DOCKER += groupadd --gid $(GID) --non-unique $(USER) && @@ -80,126 +112,110 @@ endif # Add docker passthrough options. ifneq ($(DOCKER_PRIVILEGED),) -FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)" -# TODO(gvisor.dev/issue/1624): Remove docker config volume. This is required -# temporarily for checking VFS1 vs VFS2 by some tests. -FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_CONFIG):$(DOCKER_CONFIG)" -FULL_DOCKER_RUN_OPTIONS += $(DOCKER_PRIVILEGED) -FULL_DOCKER_EXEC_OPTIONS += $(DOCKER_PRIVILEGED) +DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)" +DOCKER_RUN_OPTIONS += -v "$(DOCKER_CONFIG):$(DOCKER_CONFIG)" +DOCKER_RUN_OPTIONS += $(DOCKER_PRIVILEGED) +DOCKER_EXEC_OPTIONS += $(DOCKER_PRIVILEGED) DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET)) ifneq ($(GID),$(DOCKER_GROUP)) USERADD_OPTIONS += --groups $(DOCKER_GROUP) GROUPADD_DOCKER += groupadd --gid $(DOCKER_GROUP) --non-unique docker-$(HASH) && -FULL_DOCKER_RUN_OPTIONS += --group-add $(DOCKER_GROUP) +DOCKER_RUN_OPTIONS += --group-add $(DOCKER_GROUP) endif endif # Add KVM passthrough options. ifneq (,$(wildcard /dev/kvm)) -FULL_DOCKER_RUN_OPTIONS += --device=/dev/kvm +DOCKER_RUN_OPTIONS += --device=/dev/kvm KVM_GROUP := $(shell stat -c '%g' /dev/kvm) ifneq ($(GID),$(KVM_GROUP)) USERADD_OPTIONS += --groups $(KVM_GROUP) GROUPADD_DOCKER += groupadd --gid $(KVM_GROUP) --non-unique kvm-$(HASH) && -FULL_DOCKER_RUN_OPTIONS += --group-add $(KVM_GROUP) +DOCKER_RUN_OPTIONS += --group-add $(KVM_GROUP) endif endif -# Load the appropriate config. -ifneq (,$(BAZEL_CONFIG)) -OPTIONS += --config=$(BAZEL_CONFIG) +# Top-level functions. +# +# This command runs a bazel server, and the container sticks around +# until the bazel server exits. This should ensure that it does not +# exit in the middle of running a build, but also it won't stick around +# forever. The build commands wrap around an appropriate exec into the +# container in order to perform work via the bazel client. +ifeq ($(DOCKER_BUILD),true) +wrapper = docker exec $(DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(1) +else +wrapper = $(1) endif -bazel-image: load-default - @if docker ps --all | grep $(BUILDER_NAME); then docker rm -f $(BUILDER_NAME); fi - docker run --user 0:0 --entrypoint "" --name $(BUILDER_NAME) \ - $(BUILDER_BASE) \ - sh -c "$(GROUPADD_DOCKER) \ - $(USERADD_DOCKER) \ - if [[ -e /dev/kvm ]]; then chmod a+rw /dev/kvm; fi" - docker commit $(BUILDER_NAME) $(BUILDER_IMAGE) - @docker rm -f $(BUILDER_NAME) -.PHONY: bazel-image - -## -## Bazel helpers. -## -## This file supports targets that wrap bazel in a running Docker -## container to simplify development. Some options are available to -## control the behavior of this container: -## USER - The in-container user. -## DOCKER_RUN_OPTIONS - Options for the container (default: --privileged, required for tests). -## DOCKER_NAME - The container name (default: gvisor-bazel-HASH). -## BAZEL_CACHE - The bazel cache directory (default: detected). -## GCLOUD_CONFIG - The gcloud config directory (detect: detected). -## DOCKER_SOCKET - The Docker socket (default: detected). -## -bazel-server-start: bazel-image ## Starts the bazel server. - @mkdir -p $(BAZEL_CACHE) - @mkdir -p $(GCLOUD_CONFIG) - @if docker ps --all | grep $(DOCKER_NAME); then docker rm -f $(DOCKER_NAME); fi - # This command runs a bazel server, and the container sticks around - # until the bazel server exits. This should ensure that it does not - # exit in the middle of running a build, but also it won't stick around - # forever. The build commands wrap around an appropriate exec into the - # container in order to perform work via the bazel client. - docker run -d --rm --name $(DOCKER_NAME) \ - -v "$(CURDIR):$(CURDIR)" \ - --workdir "$(CURDIR)" \ - $(FULL_DOCKER_RUN_OPTIONS) \ - $(BUILDER_IMAGE) \ - sh -c "tail -f --pid=\$$($(BAZEL) info server_pid) /dev/null" -.PHONY: bazel-server-start - bazel-shutdown: ## Shuts down a running bazel server. - @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(BAZEL) shutdown; \ - rc=$$?; docker kill $(DOCKER_NAME) || [[ $$rc -ne 0 ]] + @$(call wrapper,$(BAZEL) shutdown) .PHONY: bazel-shutdown bazel-alias: ## Emits an alias that can be used within the shell. - @echo "alias bazel='docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) bazel'" + @echo "alias bazel='$(call wrapper,$(BAZEL))'" .PHONY: bazel-alias -bazel-server: ## Ensures that the server exists. Used as an internal target. - @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) true || $(MAKE) bazel-server-start -.PHONY: bazel-server +bazel-image: load-default ## Ensures that the local builder exists. + @$(call header,DOCKER BUILD) + @docker rm -f $(BUILDER_NAME) 2>/dev/null || true + @docker run --user 0:0 --entrypoint "" --name $(BUILDER_NAME) gvisor.dev/images/default \ + sh -c "$(GROUPADD_DOCKER) $(USERADD_DOCKER) if test -e /dev/kvm; then chmod a+rw /dev/kvm; fi" + @docker commit $(BUILDER_NAME) gvisor.dev/images/builder +.PHONY: bazel-image -build_cmd = docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) sh -o pipefail -c '$(BAZEL) build $(OPTIONS) "$(TARGETS)"' - -build_paths = $(build_cmd) 2>&1 \ - | tee /proc/self/fd/2 \ - | grep -A1 -E '^Target' \ - | grep -E '^ ($(subst $(SPACE),|,$(BUILD_ROOTS)))' \ - | sed "s/ /\n/g" \ - | strings -n 10 \ - | awk '{$$1=$$1};1' \ - | xargs -n 1 -I {} readlink -f "{}" \ - | xargs -n 1 -I {} sh -c "$(1)" - -build: bazel-server - @$(call build_cmd) -.PHONY: build - -copy: bazel-server -ifeq (,$(DESTINATION)) - $(error Destination not provided.) +ifneq (true,$(shell $(wrapper echo true))) +bazel-server: bazel-image ## Ensures that the server exists. + @$(call header,DOCKER RUN) + @docker rm -f $(DOCKER_NAME) 2>/dev/null || true + @mkdir -p $(GCLOUD_CONFIG) + @mkdir -p $(BAZEL_CACHE) + @docker run -d --rm --name $(DOCKER_NAME) \ + -v "$(CURDIR):$(CURDIR)" \ + --workdir "$(CURDIR)" \ + $(DOCKER_RUN_OPTIONS) \ + gvisor.dev/images/builder \ + sh -c "set -x; tail -f --pid=\$$($(BAZEL) info server_pid) /dev/null" +else +bazel-server: + @ endif - @$(call build_paths,cp -fa {} $(DESTINATION)) - -run: bazel-server - @$(call build_paths,{} $(ARGS)) -.PHONY: run - -sudo: bazel-server - @$(call build_paths,sudo -E {} $(ARGS)) -.PHONY: sudo - -test: OPTIONS += --test_output=errors --keep_going --verbose_failures=true -test: bazel-server - @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) $(BAZEL) test $(OPTIONS) $(TARGETS) -.PHONY: test +.PHONY: bazel-server -query: - @$(MAKE) bazel-server >&2 # If we need to start, ensure stdout is not polluted. - @docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) sh -o pipefail -c '$(BAZEL) query $(OPTIONS) "$(TARGETS)" 2>/dev/null' -.PHONY: query +# build_paths extracts the built binary from the bazel stderr output. +# +# This could be alternately done by parsing the bazel build event stream, but +# this is a complex schema, and begs the question: what will build the thing +# that parses the output? Bazel? Do we need a separate bootstrapping build +# command here? Yikes, let's just stick with the ugly shell pipeline. +# +# The last line is used to prevent terminal shenanigans. +build_paths = \ + $(call wrapper,$(BAZEL) build $(BASE_OPTIONS) $(1)) 2>&1 \ + | tee /proc/self/fd/2 \ + | grep -A1 -E '^Target' \ + | grep -E '^ ($(subst $(SPACE),|,$(BUILD_ROOTS)))' \ + | sed "s/ /\n/g" \ + | strings -n 10 \ + | awk '{$$1=$$1};1' \ + | xargs -n 1 -I {} readlink -f "{}" \ + | xargs -n 1 -I {} bash -c 'set -xeuo pipefail; $(2)' + +clean = $(call header,CLEAN) && $(call wrapper,$(BAZEL) clean) +build = $(call header,BUILD $(1)) && $(call build_paths,$(1),echo {}) +copy = $(call header,COPY $(1) $(2)) && $(call build_paths,$(1),cp -fa {} $(2)) +run = $(call header,RUN $(1) $(2)) && $(call build_paths,$(1),{} $(2)) +sudo = $(call header,SUDO $(1) $(2)) && $(call build_paths,$(1),sudo -E {} $(2)) +test = $(call header,TEST $(1)) && $(call wrapper,$(BAZEL) test $(TEST_OPTIONS) $(1)) + +clean: ## Cleans the bazel cache. + @$(call clean) +.PHONY: clean + +testlogs: ## Returns the most recent set of test logs. + @if test -f .build_events.json; then \ + cat .build_events.json | jq -r \ + 'select(.testSummary?.overallStatus? | tostring | test("(FAILED|FLAKY|TIMEOUT)")) | .testSummary.failed | .[] | .uri' | \ + awk -Ffile:// '{print $$2;}'; \ + fi +.PHONY: testlogs diff --git a/tools/bazel_gazelle.patch b/tools/bazel_gazelle.patch new file mode 100644 index 000000000..e35f38933 --- /dev/null +++ b/tools/bazel_gazelle.patch @@ -0,0 +1,24 @@ +diff -r -u2 a/language/go/resolve.go b/language/go/resolve.go +--- a/language/go/resolve.go 2020-10-02 14:22:18.000000000 -0700 ++++ b/language/go/resolve.go 2020-11-17 19:40:59.770648029 -0800 +@@ -20,5 +20,4 @@ + "fmt" + "go/build" +- "log" + "path" + "regexp" +@@ -80,5 +79,5 @@ + resolve = ResolveGo + } +- deps, errs := imports.Map(func(imp string) (string, error) { ++ deps, _ := imports.Map(func(imp string) (string, error) { + l, err := resolve(c, ix, rc, imp, from) + if err == skipImportError { +@@ -95,7 +94,4 @@ + return l.String(), nil + }) +- for _, err := range errs { +- log.Print(err) +- } + if !deps.IsEmpty() { + if r.Kind() == "go_proto_library" { diff --git a/tools/bazeldefs/BUILD b/tools/bazeldefs/BUILD index 27e85a75e..97c7cb45f 100644 --- a/tools/bazeldefs/BUILD +++ b/tools/bazeldefs/BUILD @@ -58,3 +58,21 @@ bzl_library( srcs = ["defs.bzl"], visibility = ["//visibility:private"], ) + +config_setting( + name = "linux_arm64_cross", + values = { + "cpu": "aarch64", + "host_cpu": "k8", + }, + visibility = ["//visibility:private"], +) + +config_setting( + name = "linux_amd64_cross", + values = { + "cpu": "k8", + "host_cpu": "aarch64", + }, + visibility = ["//visibility:private"], +) diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl index c2f94bb9c..279a38fed 100644 --- a/tools/bazeldefs/defs.bzl +++ b/tools/bazeldefs/defs.bzl @@ -7,6 +7,8 @@ build_test = _build_test bzl_library = _bzl_library rbe_platform = native.platform rbe_toolchain = native.toolchain +more_shards = 4 +most_shards = 8 def short_path(path): return path @@ -37,3 +39,44 @@ def default_net_util(): def coreutil(): return [] # Nothing needed. + +def select_native_vs_cross(native = [], amd64 = [], arm64 = [], cross = []): + values = { + "//tools/bazeldefs:linux_arm64_cross": arm64 + cross, + "//tools/bazeldefs:linux_amd64_cross": amd64 + cross, + "//conditions:default": native, + } + return select(values) + +def arch_genrule(name, srcs, outs, cmd, tools): + """Runs a gen command on the target architecture. + + If the target architecture isn't match the host architecture, it will build + a command for the target architecture and run it via qemu. + + The native genrule runs the command on the host architecture. + + Args: + name: name of generated target. + srcs: A list of inputs for this rule. + cmd: The command to run. It has to contain " QEMU " before executed binaries. + outs: A list of files generated by this rule. + tools: A list of tool dependencies for this rule. + """ + qemu_arm64 = "qemu-aarch64-static" + qemu_amd64 = "qemu-x86_64-static" + srcs = select_native_vs_cross( + cross = srcs + tools, + native = srcs, + ) + tools = select_native_vs_cross( + cross = [], + native = tools, + ) + cmd = select_native_vs_cross( + arm64 = cmd.replace("QEMU", qemu_arm64), + amd64 = cmd.replace("QEMU", qemu_amd64), + native = cmd.replace("QEMU", ""), + cross = "", + ) + native.genrule(name = name, srcs = srcs, outs = outs, cmd = cmd, tools = tools) diff --git a/tools/bazeldefs/go.bzl b/tools/bazeldefs/go.bzl index 661c9727e..bcd8cffe7 100644 --- a/tools/bazeldefs/go.bzl +++ b/tools/bazeldefs/go.bzl @@ -28,7 +28,7 @@ def go_proto_library(name, **kwargs): def go_grpc_and_proto_libraries(name, **kwargs): _go_proto_or_grpc_library(_go_grpc_library, name, **kwargs) -def go_binary(name, static = False, pure = False, x_defs = None, **kwargs): +def go_binary(name, static = False, pure = False, x_defs = None, system_malloc = False, **kwargs): """Build a go binary. Args: @@ -52,7 +52,7 @@ def go_importpath(target): """Returns the importpath for the target.""" return target[GoLibrary].importpath -def go_library(name, **kwargs): +def go_library(name, arch_deps = [], **kwargs): _go_library( name = name, importpath = "gvisor.dev/gvisor/" + native.package_name(), diff --git a/tools/bigquery/bigquery.go b/tools/bigquery/bigquery.go index 544af3876..a4ca93ec2 100644 --- a/tools/bigquery/bigquery.go +++ b/tools/bigquery/bigquery.go @@ -21,6 +21,7 @@ package bigquery import ( "context" "fmt" + "strconv" "strings" "time" @@ -109,6 +110,12 @@ func NewBenchmark(name string, iters int) *Benchmark { return &Benchmark{ Name: name, Metric: make([]*Metric, 0), + Condition: []*Condition{ + { + Name: "iterations", + Value: strconv.Itoa(iters), + }, + }, } } diff --git a/tools/checkescape/test1/test1.go b/tools/checkescape/test1/test1.go index 27991649f..f46eba39b 100644 --- a/tools/checkescape/test1/test1.go +++ b/tools/checkescape/test1/test1.go @@ -36,17 +36,20 @@ func (t Type) Foo() { fmt.Printf("%v", t) // Never executed. } +// InterfaceFunction is passed an interface argument. // +checkescape:all,hard //go:nosplit func InterfaceFunction(i Interface) { // Do nothing; exported for tests. } +// TypeFunction is passed a concrete pointer argument. // +checkesacape:all,hard //go:nosplit func TypeFunction(t *Type) { } +// BuiltinMap creates a new map. // +mustescape:local,builtin //go:noinline //go:nosplit @@ -61,7 +64,8 @@ func builtinMapRec(x int) map[string]bool { return BuiltinMap(x) } -// +temustescapestescape:local,builtin +// BuiltinClosure returns a closure around x. +// +mustescape:local,builtin //go:noinline //go:nosplit func BuiltinClosure(x int) func() { @@ -77,6 +81,7 @@ func builtinClosureRec(x int) func() { return BuiltinClosure(x) } +// BuiltinMakeSlice makes a new slice. // +mustescape:local,builtin //go:noinline //go:nosplit @@ -91,6 +96,7 @@ func builtinMakeSliceRec(x int) []byte { return BuiltinMakeSlice(x) } +// BuiltinAppend calls append on a slice. // +mustescape:local,builtin //go:noinline //go:nosplit @@ -105,6 +111,7 @@ func builtinAppendRec() []byte { return BuiltinAppend(nil) } +// BuiltinChan makes a channel. // +mustescape:local,builtin //go:noinline //go:nosplit @@ -119,6 +126,7 @@ func builtinChanRec() chan int { return BuiltinChan() } +// Heap performs an explicit heap allocation. // +mustescape:local,heap //go:noinline //go:nosplit @@ -134,6 +142,7 @@ func heapRec() *Type { return Heap() } +// Dispatch dispatches via an interface. // +mustescape:local,interface //go:noinline //go:nosplit @@ -148,6 +157,7 @@ func dispatchRec(i Interface) { Dispatch(i) } +// Dynamic invokes a dynamic function. // +mustescape:local,dynamic //go:noinline //go:nosplit @@ -167,6 +177,7 @@ func dynamicRec(f func()) { func internalFunc() { } +// Split includes a guaranteed stack split. // +mustescape:local,stack //go:noinline func Split() { diff --git a/tools/defs.bzl b/tools/defs.bzl index 2c8129e7e..54d756e55 100644 --- a/tools/defs.bzl +++ b/tools/defs.bzl @@ -8,7 +8,7 @@ change for Google-internal and bazel-compatible rules. load("//tools/go_stateify:defs.bzl", "go_stateify") load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps") load("//tools/nogo:defs.bzl", "nogo_test") -load("//tools/bazeldefs:defs.bzl", _build_test = "build_test", _bzl_library = "bzl_library", _coreutil = "coreutil", _default_installer = "default_installer", _default_net_util = "default_net_util", _proto_library = "proto_library", _rbe_platform = "rbe_platform", _rbe_toolchain = "rbe_toolchain", _select_arch = "select_arch", _select_system = "select_system", _short_path = "short_path") +load("//tools/bazeldefs:defs.bzl", _arch_genrule = "arch_genrule", _build_test = "build_test", _bzl_library = "bzl_library", _coreutil = "coreutil", _default_installer = "default_installer", _default_net_util = "default_net_util", _more_shards = "more_shards", _most_shards = "most_shards", _proto_library = "proto_library", _rbe_platform = "rbe_platform", _rbe_toolchain = "rbe_toolchain", _select_arch = "select_arch", _select_system = "select_system", _short_path = "short_path") load("//tools/bazeldefs:cc.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _gbenchmark = "gbenchmark", _grpcpp = "grpcpp", _gtest = "gtest", _vdso_linker_option = "vdso_linker_option") load("//tools/bazeldefs:go.bzl", _gazelle = "gazelle", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_path = "go_path", _go_proto_library = "go_proto_library", _go_test = "go_test", _select_goarch = "select_goarch", _select_goos = "select_goos") load("//tools/bazeldefs:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar") @@ -16,6 +16,7 @@ load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", load("//tools/bazeldefs:tags.bzl", "go_suffixes") # Core rules. +arch_genrule = _arch_genrule build_test = _build_test bzl_library = _bzl_library default_installer = _default_installer @@ -26,6 +27,8 @@ short_path = _short_path rbe_platform = _rbe_platform rbe_toolchain = _rbe_toolchain coreutil = _coreutil +more_shards = _more_shards +most_shards = _most_shards # C++ rules. cc_binary = _cc_binary @@ -182,6 +185,7 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F name + suffix + "_state_autogen.go" for suffix in state_sets.keys() ] + if "//pkg/state" not in all_deps: all_deps = all_deps + ["//pkg/state"] diff --git a/tools/go_branch.sh b/tools/go_branch.sh index 71d036b12..ca07246a6 100755 --- a/tools/go_branch.sh +++ b/tools/go_branch.sh @@ -39,7 +39,7 @@ declare tmp_dir tmp_dir=$(mktemp -d) readonly tmp_dir finish() { - cd # Leave tmp_dir. + cd / # Leave tmp_dir. rm -rf "${tmp_dir}" } trap finish EXIT @@ -90,7 +90,7 @@ find . -type f -exec chmod 0644 {} \; find . -type d -exec chmod 0755 {} \; # Sync the entire gopath_dir. -rsync --recursive --verbose --delete --exclude .git -L "${gopath_dir}/" . +rsync --recursive --delete --exclude .git -L "${gopath_dir}/" . # Add additional files. for file in "${othersrc[@]}"; do diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl index ad97208a8..50e2546bf 100644 --- a/tools/go_generics/defs.bzl +++ b/tools/go_generics/defs.bzl @@ -67,7 +67,7 @@ def _go_template_instance_impl(ctx): # Check that all defined types are expected by the template. for t in ctx.attr.types: if (t not in info.types) and (t not in info.opt_types): - fail("Type %s it not a parameter to %s" % (t, ctx.attr.template.label)) + fail("Type %s is not a parameter to %s" % (t, ctx.attr.template.label)) # Check that all required consts are defined. for t in info.consts: @@ -77,7 +77,7 @@ def _go_template_instance_impl(ctx): # Check that all defined consts are expected by the template. for t in ctx.attr.consts: if (t not in info.consts) and (t not in info.opt_consts): - fail("Const %s it not a parameter to %s" % (t, ctx.attr.template.label)) + fail("Const %s is not a parameter to %s" % (t, ctx.attr.template.label)) # Build the argument list. args = ["-i=%s" % info.template.path, "-o=%s" % output.path] diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go index 0860ca9db..30584006c 100644 --- a/tools/go_generics/generics.go +++ b/tools/go_generics/generics.go @@ -223,7 +223,7 @@ func main() { } else { switch kind { case globals.KindType, globals.KindVar, globals.KindConst, globals.KindFunction: - if ident.Name != "_" { + if ident.Name != "_" && !(ident.Name == "init" && kind == globals.KindFunction) { ident.Name = *prefix + ident.Name + *suffix } case globals.KindTag: diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 4a53d25be..6f41b1b79 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -213,10 +213,11 @@ type sliceAPI struct { type marshallableType struct { spec *ast.TypeSpec slice *sliceAPI + recv string } -func newMarshallableType(fset *token.FileSet, tagLine *ast.Comment, spec *ast.TypeSpec) marshallableType { - mt := marshallableType{ +func newMarshallableType(fset *token.FileSet, tagLine *ast.Comment, spec *ast.TypeSpec) *marshallableType { + mt := &marshallableType{ spec: spec, slice: nil, } @@ -261,12 +262,31 @@ func newMarshallableType(fset *token.FileSet, tagLine *ast.Comment, spec *ast.Ty // collectMarshallableTypes walks the parsed AST and collects a list of type // declarations for which we need to generate the Marshallable interface. -func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []marshallableType { - var types []marshallableType +func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) map[*ast.TypeSpec]*marshallableType { + recv := make(map[string]string) // Type name to recevier name. + types := make(map[*ast.TypeSpec]*marshallableType) for _, decl := range a.Decls { gdecl, ok := decl.(*ast.GenDecl) // Type declaration? if !ok || gdecl.Tok != token.TYPE { + // Is this a function declaration? We remember receiver names. + d, ok := decl.(*ast.FuncDecl) + if ok && d.Recv != nil && len(d.Recv.List) == 1 { + // Accept concrete methods & pointer methods. + ident, ok := d.Recv.List[0].Type.(*ast.Ident) + if !ok { + var st *ast.StarExpr + st, ok = d.Recv.List[0].Type.(*ast.StarExpr) + if ok { + ident, ok = st.X.(*ast.Ident) + } + } + // The receiver name may be not present. + if ok && len(d.Recv.List[0].Names) == 1 { + // Recover the type receiver name in this case. + recv[ident.Name] = d.Recv.List[0].Names[0].Name + } + } debugfAt(f.Position(decl.Pos()), "Skipping declaration since it's not a type declaration.\n") continue } @@ -305,9 +325,19 @@ func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []ma // don't support it. abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name)) } - types = append(types, newMarshallableType(f, tagLine, t)) - + types[t] = newMarshallableType(f, tagLine, t) + } + } + // Update the types with the last seen receiver. As long as the + // receiver name is consistent for the type, then we will generate + // code that is still consistent with itself. + for t, mt := range types { + r, ok := recv[t.Name.Name] + if !ok { + mt.recv = receiverName(t) // Default. + continue } + mt.recv = r // Last seen. } return types } @@ -345,8 +375,8 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp } -func (g *Generator) generateOne(t marshallableType, fset *token.FileSet) *interfaceGenerator { - i := newInterfaceGenerator(t.spec, fset) +func (g *Generator) generateOne(t *marshallableType, fset *token.FileSet) *interfaceGenerator { + i := newInterfaceGenerator(t.spec, t.recv, fset) switch ty := t.spec.Type.(type) { case *ast.StructType: i.validateStruct(t.spec, ty) @@ -376,8 +406,8 @@ func (g *Generator) generateOne(t marshallableType, fset *token.FileSet) *interf // generateOneTestSuite generates a test suite for the automatically generated // implementations type t. -func (g *Generator) generateOneTestSuite(t marshallableType) *testGenerator { - i := newTestGenerator(t.spec) +func (g *Generator) generateOneTestSuite(t *marshallableType) *testGenerator { + i := newTestGenerator(t.spec, t.recv) i.emitTests(t.slice) return i } diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go index 36447b86b..65f5ea34d 100644 --- a/tools/go_marshal/gomarshal/generator_interfaces.go +++ b/tools/go_marshal/gomarshal/generator_interfaces.go @@ -54,10 +54,10 @@ func (g *interfaceGenerator) typeName() string { } // newinterfaceGenerator creates a new interface generator. -func newInterfaceGenerator(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator { +func newInterfaceGenerator(t *ast.TypeSpec, r string, fset *token.FileSet) *interfaceGenerator { g := &interfaceGenerator{ t: t, - r: receiverName(t), + r: r, f: fset, is: make(map[string]struct{}), ms: make(map[string]struct{}), diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go index 631295373..6cf00843f 100644 --- a/tools/go_marshal/gomarshal/generator_tests.go +++ b/tools/go_marshal/gomarshal/generator_tests.go @@ -53,10 +53,10 @@ type testGenerator struct { decl *importStmt } -func newTestGenerator(t *ast.TypeSpec) *testGenerator { +func newTestGenerator(t *ast.TypeSpec, r string) *testGenerator { g := &testGenerator{ t: t, - r: receiverName(t), + r: r, imports: newImportTable(), } diff --git a/tools/images.mk b/tools/images.mk new file mode 100644 index 000000000..46f56bb2c --- /dev/null +++ b/tools/images.mk @@ -0,0 +1,169 @@ +#!/usr/bin/make -f + +# Copyright 2018 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## +## Docker image targets. +## +## Images used by the tests must also be built and available locally. +## The canonical test targets defined below will automatically load +## relevant images. These can be loaded or built manually via these +## targets. +## +## (*) Note that you may provide an ARCH parameter in order to build +## and load images from an alternate archiecture (using qemu). When +## bazel is run as a server, this has the effect of running an full +## cross-architecture chain, and can produce cross-compiled binaries. +## + +# ARCH is the architecture used for the build. This may be overriden at the +# command line in order to perform a cross-build (in a limited capacity). +ARCH := $(shell uname -m) +ifneq ($(ARCH),$(shell uname -m)) +DOCKER_PLATFORM_ARGS := --platform=$(ARCH) +else +DOCKER_PLATFORM_ARGS := +endif + +# Note that the image prefixes used here must match the image mangling in +# runsc/testutil.MangleImage. Names are mangled in this way to ensure that all +# tests are using locally-defined images (that are consistent and idempotent). +REMOTE_IMAGE_PREFIX ?= gcr.io/gvisor-presubmit +LOCAL_IMAGE_PREFIX ?= gvisor.dev/images +ALL_IMAGES := $(subst /,_,$(subst images/,,$(shell find images/ -name Dockerfile -o -name Dockerfile.$(ARCH) | xargs -n 1 dirname | uniq))) +SUB_IMAGES := $(foreach image,$(ALL_IMAGES),$(if $(findstring _,$(image)),$(image),)) +IMAGE_GROUPS := $(sort $(foreach image,$(SUB_IMAGES),$(firstword $(subst _, ,$(image))))) + +define expand_group = +load-$(1): $$(patsubst $(1)_%, load-$(1)_%, $$(filter $(1)_%,$$(ALL_IMAGES))) + @ +.PHONY: load-$(1) +push-$(1): $$(patsubst $(1)_%, push-$(1)_%, $$(filter $(1)_%,$$(ALL_IMAGES))) + @ +.PHONY: push-$(1) +endef +$(foreach group,$(IMAGE_GROUPS),$(eval $(call expand_group,$(group)))) + +list-all-images: ## List all images. + @for image in $(ALL_IMAGES); do echo $${image}; done +.PHONY: list-all-images + +load-all-images: ## Load all images. +load-all-images: $(patsubst %,load-%,$(ALL_IMAGES)) +.PHONY: load-all-images + +push-all-images: ## Push all images. +push-all-images: $(patsubst %,push-%,$(ALL_IMAGES)) +.PHONY: push-all-images + +# path and dockerfile are used to extract the relevant path and dockerfile +# (depending on what's available for the given architecture). +path = images/$(subst _,/,$(1)) +dockerfile = $$(if [ -f "$(call path,$(1))/Dockerfile.$(ARCH)" ]; then echo Dockerfile.$(ARCH); else echo Dockerfile; fi) + +# The tag construct is used to memoize the image generated (see README.md). +# This scheme is used to enable aggressive caching in a central repository, but +# ensuring that images will always be sourced using the local files. +tag = $(shell cd images && find $(subst _,/,$(1)) -type f | sort | xargs -n 1 sha256sum | sha256sum - | cut -c 1-16) +remote_image = $(REMOTE_IMAGE_PREFIX)/$(subst _,/,$(1))_$(ARCH) +local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1)) + +# Include all existing images as targets here. +# +# Note that we use a _ for the tag separator, instead of :, as the latter is +# interpreted by Make, unfortunately. tag_expand expands the generic rules to +# tag-specific targets. These is needed to provide sensible targets for load +# below, with caching. Basically, if there is a rule generated here, then the +# load will be skipped. If there is no load generated here, then the default +# rule for load will kick in. +# +# Note that if this rule does not successfully rule, we will simply have +# additional Docker pull commands that run for all images that are already +# pulled. No real harm done. +EXISTING_IMAGES = $(shell docker images --format '{{.Repository}}_{{.Tag}}' | grep -v '<none>') +define existing_image_rule = +loaded0_$(1)=load-$$(1): tag-$$(1) # Already available. +loaded1_$(1)=.PHONY: load-$$(1) +endef +$(foreach image, $(EXISTING_IMAGES), $(eval $(call existing_image_rule,$(image)))) +define tag_expand_rule = +$(eval $(loaded0_$(call remote_image,$(1))_$(call tag,$(1)))) +$(eval $(loaded1_$(call remote_image,$(1))_$(call tag,$(1)))) +endef +$(foreach image, $(ALL_IMAGES), $(eval $(call tag_expand_rule,$(image)))) + +# tag tags a local image. This applies both the hash-based tag from above to +# ensure that caching works as expected, as well as the "latest" tag that is +# used by the tests. +local_tag = \ + docker tag $(call remote_image,$(1)):$(call tag,$(1)) $(call local_image,$(1)):$(call tag,$(1)) +latest_tag = \ + docker tag $(call local_image,$(1)):$(call tag,$(1)) $(call local_image,$(1)) +tag-%: ## Tag a local image. + @$(call header,TAG $*) + @$(call local_tag,$*) && $(call latest_tag,$*) + +# pull forces the image to be pulled. +pull = \ + $(call header,PULL $(1)) && \ + docker pull $(DOCKER_PLATFORM_ARGS) $(call remote_image,$(1)):$(call tag,$(1)) && \ + $(call local_tag,$(1)) && \ + $(call latest_tag,$(1)) +pull-%: register-cross ## Force a repull of the image. + @$(call pull,$*) + +# rebuild builds the image locally. Only the "remote" tag will be applied. Note +# we need to explicitly repull the base layer in order to ensure that the +# architecture is correct. Note that we use the term "rebuild" here to avoid +# conflicting with the bazel "build" terminology, which is used elsewhere. +rebuild = \ + $(call header,REBUILD $(1)) && \ + (T=$$(mktemp -d) && cp -a $(call path,$(1))/* $$T && \ + $(foreach image,$(shell grep FROM "$(call path,$(1))/$(call dockerfile,$(1))" 2>/dev/null | cut -d' ' -f2),docker pull $(DOCKER_PLATFORM_ARGS) $(image) &&) \ + docker build $(DOCKER_PLATFORM_ARGS) \ + -f "$$T/$(call dockerfile,$(1))" \ + -t "$(call remote_image,$(1)):$(call tag,$(1))" \ + $$T && \ + rm -rf $$T) && \ + $(call local_tag,$(1)) && \ + $(call latest_tag,$(1)) +rebuild-%: register-cross ## Force rebuild an image locally. + @$(call rebuild,$*) + +# load will either pull the "remote" or build it locally. This is the preferred +# entrypoint, as it should never fail. The local tag should always be set after +# this returns (either by the pull or the build). +load-%: register-cross ## Pull or build an image locally. + @($(call pull,$*)) || ($(call rebuild,$*)) + +# push pushes the remote image, after either pulling (to validate that the tag +# already exists) or building manually. Note that this generic rule will match +# the fully-expanded remote image tag. +push-%: load-% ## Push a given image. + @docker push $(call remote_image,$*):$(call tag,$*) + +# register-cross registers the necessary qemu binaries for cross-compilation. +# This may be used by any target that may execute containers that are not the +# native format. Note that this will only apply on the first execution. +register-cross: +ifneq ($(ARCH),$(shell uname -m)) +ifeq (,$(wildcard /proc/sys/fs/binfmt_misc/qemu-*)) + @docker run --rm --privileged multiarch/qemu-user-static --reset --persistent yes +else + @ +endif +else + @ +endif diff --git a/tools/installers/BUILD b/tools/installers/BUILD index 13d3cc5e0..bbf3c1f85 100644 --- a/tools/installers/BUILD +++ b/tools/installers/BUILD @@ -1,4 +1,4 @@ -# Installers for use by the tools/vm_test rules. +# Installers for use by top-level scripts. package( default_visibility = ["//:sandbox"], @@ -14,14 +14,6 @@ sh_binary( ) sh_binary( - name = "images", - srcs = ["images.sh"], - data = [ - "//images", - ], -) - -sh_binary( name = "master", srcs = ["master.sh"], ) diff --git a/tools/installers/containerd.sh b/tools/installers/containerd.sh index 6b7bb261c..d28549734 100755 --- a/tools/installers/containerd.sh +++ b/tools/installers/containerd.sh @@ -16,7 +16,7 @@ set -xeo pipefail -declare -r CONTAINERD_VERSION=${CONTAINERD_VERSION:-1.3.0} +declare -r CONTAINERD_VERSION=${1:-1.3.0} declare -r CONTAINERD_MAJOR="$(echo ${CONTAINERD_VERSION} | awk -F '.' '{ print $1; }')" declare -r CONTAINERD_MINOR="$(echo ${CONTAINERD_VERSION} | awk -F '.' '{ print $2; }')" @@ -43,10 +43,23 @@ install_helper() { make install) } +# Figure out were btrfs headers are. +# +# Ubuntu 16.04 has only btrfs-tools, while 18.04 has a transitional package, +# and later versions no longer have the transitional package. +source /etc/os-release +declare BTRFS_DEV +if [[ "${VERSION_ID%.*}" -le "18" ]]; then + BTRFS_DEV="btrfs-tools" +else + BTRFS_DEV="libbtrfs-dev" +fi +readonly BTRFS_DEV + # Install dependencies for the crictl tests. while true; do if (apt-get update && apt-get install -y \ - btrfs-tools \ + "${BTRFS_DEV}" \ libseccomp-dev); then break fi diff --git a/tools/nogo/filter/main.go b/tools/nogo/filter/main.go index 9cf41b3b0..8be38ca6d 100644 --- a/tools/nogo/filter/main.go +++ b/tools/nogo/filter/main.go @@ -16,6 +16,7 @@ package main import ( + "bytes" "flag" "fmt" "io/ioutil" @@ -76,12 +77,14 @@ func main() { log.Fatalf("unable to read %s: %v", filename, err) } var newConfig nogo.Config // For current file. - if err := yaml.Unmarshal(content, &newConfig); err != nil { + dec := yaml.NewDecoder(bytes.NewBuffer(content)) + dec.SetStrict(true) + if err := dec.Decode(&newConfig); err != nil { log.Fatalf("unable to decode %s: %v", filename, err) } config.Merge(&newConfig) if showConfig { - bytes, err := yaml.Marshal(&newConfig) + content, err := yaml.Marshal(&newConfig) if err != nil { log.Fatalf("error marshalling config: %v", err) } @@ -89,7 +92,7 @@ func main() { if err != nil { log.Fatalf("error marshalling config: %v", err) } - fmt.Fprintf(os.Stdout, "Loaded configuration from %s:\n%s\n", filename, string(bytes)) + fmt.Fprintf(os.Stdout, "Loaded configuration from %s:\n%s\n", filename, string(content)) fmt.Fprintf(os.Stdout, "Merged configuration:\n%s\n", string(mergedBytes)) } } diff --git a/tools/parsers/go_parser_test.go b/tools/parsers/go_parser_test.go index f0737d46b..39a13b4af 100644 --- a/tools/parsers/go_parser_test.go +++ b/tools/parsers/go_parser_test.go @@ -34,6 +34,10 @@ func TestParseLine(t *testing.T) { Name: "BenchmarkIperf", Condition: []*bigquery.Condition{ { + Name: "iterations", + Value: "1", + }, + { Name: "GOMAXPROCS", Value: "6", }, @@ -63,6 +67,10 @@ func TestParseLine(t *testing.T) { Name: "BenchmarkRuby", Condition: []*bigquery.Condition{ { + Name: "iterations", + Value: "1", + }, + { Name: "GOMAXPROCS", Value: "6", }, @@ -100,12 +108,14 @@ func TestParseLine(t *testing.T) { } if !cmp.Equal(tc.want, got, nil) { - for _, c := range got.Condition { - t.Logf("Cond: %+v", c) + for i := range got.Condition { + t.Logf("Metric: want: %+v got:%+v", got.Condition[i], tc.want.Condition[i]) } - for _, m := range got.Metric { - t.Logf("Metric: %+v", m) + + for i := range got.Metric { + t.Logf("Metric: want: %+v got:%+v", got.Metric[i], tc.want.Metric[i]) } + t.Fatalf("Compare failed want: %+v got: %+v", tc.want, got) } }) @@ -131,7 +141,7 @@ func TestParseOutput(t *testing.T) { `, numBenchmarks: 2, numMetrics: 1, - numConditions: 1, + numConditions: 2, }, { name: "Ruby", @@ -142,7 +152,7 @@ BenchmarkRuby/server_threads.5 BenchmarkRuby/server_threads.5-6 1 1416003331 ns/op 0.00950 average_latency.s 465 requests_per_second.QPS`, numBenchmarks: 2, numMetrics: 3, - numConditions: 2, + numConditions: 3, }, } diff --git a/tools/vm/BUILD b/tools/vm/BUILD deleted file mode 100644 index d95ca6c63..000000000 --- a/tools/vm/BUILD +++ /dev/null @@ -1,63 +0,0 @@ -load("//tools:defs.bzl", "bzl_library", "cc_binary", "gtest") -load("//tools/vm:defs.bzl", "vm_image", "vm_test") - -package( - default_visibility = ["//:sandbox"], - licenses = ["notice"], -) - -sh_binary( - name = "zone", - srcs = ["zone.sh"], -) - -sh_binary( - name = "builder", - srcs = ["build.sh"], -) - -sh_binary( - name = "executer", - srcs = ["execute.sh"], -) - -cc_binary( - name = "test", - testonly = 1, - srcs = ["test.cc"], - linkstatic = 1, - deps = [ - gtest, - "//test/util:test_main", - ], -) - -vm_image( - name = "ubuntu1604", - family = "ubuntu-1604-lts", - project = "ubuntu-os-cloud", - scripts = [ - "//tools/vm/ubuntu1604", - ], -) - -vm_image( - name = "ubuntu1804", - family = "ubuntu-1804-lts", - project = "ubuntu-os-cloud", - scripts = [ - "//tools/vm/ubuntu1804", - ], -) - -vm_test( - name = "vm_test", - shard_count = 2, - targets = [":test"], -) - -bzl_library( - name = "defs_bzl", - srcs = ["defs.bzl"], - visibility = ["//visibility:private"], -) diff --git a/tools/vm/README.md b/tools/vm/README.md deleted file mode 100644 index 1e9859e66..000000000 --- a/tools/vm/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# VM Images & Tests - -All commands in this directory require the `gcloud` project to be set. - -For example: `gcloud config set project gvisor-kokoro-testing`. - -Images can be generated by using the `vm_image` rule. This rule will generate a -binary target that builds an image in an idempotent way, and can be referenced -from other rules. - -For example: - -``` -vm_image( - name = "ubuntu", - project = "ubuntu-1604-lts", - family = "ubuntu-os-cloud", - scripts = [ - "script.sh", - "other.sh", - ], -) -``` - -These images can be built manually by executing the target. The output on -`stdout` will be the image id (in the current project). - -For example: - -``` -$ bazel build :ubuntu -``` - -Images are always named per the hash of all the hermetic input scripts. This -allows images to be memoized quickly and easily. - -The `vm_test` rule can be used to execute a command remotely. This is still -under development however, and will likely change over time. - -For example: - -``` -vm_test( - name = "mycommand", - image = ":ubuntu", - targets = [":test"], -) -``` diff --git a/tools/vm/build.sh b/tools/vm/build.sh deleted file mode 100755 index 752b2b77b..000000000 --- a/tools/vm/build.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script is responsible for building a new GCP image that: 1) has nested -# virtualization enabled, and 2) has been completely set up with the -# image_setup.sh script. This script should be idempotent, as we memoize the -# setup script with a hash and check for that name. - -set -eou pipefail - -# Parameters. -declare -r USERNAME=${USERNAME:-test} -declare -r IMAGE_PROJECT=${IMAGE_PROJECT:-ubuntu-os-cloud} -declare -r IMAGE_FAMILY=${IMAGE_FAMILY:-ubuntu-1604-lts} -declare -r ZONE=${ZONE:-us-central1-f} - -# Random names. -declare -r DISK_NAME=$(mktemp -u disk-XXXXXX | tr A-Z a-z) -declare -r SNAPSHOT_NAME=$(mktemp -u snapshot-XXXXXX | tr A-Z a-z) -declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z) - -# Hash inputs in order to memoize the produced image. -declare -r SETUP_HASH=$( (echo ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && cat "$@") | sha256sum - | cut -d' ' -f1 | cut -c 1-16) -declare -r IMAGE_NAME=${IMAGE_FAMILY:-image}-${SETUP_HASH} - -# Does the image already exist? Skip the build. -declare -r existing=$(set -x; gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)") -if ! [[ -z "${existing}" ]]; then - echo "${existing}" - exit 0 -fi - -# Standard arguments (applies only on script execution). -declare -ar SSH_ARGS=("-o" "ConnectTimeout=60" "--") - -# gcloud has path errors; is this a result of being a genrule? -export PATH=${PATH:-/bin:/usr/bin:/usr/local/bin} - -# Start a unique instance. Note that this instance will have a unique persistent -# disk as it's boot disk with the same name as the instance. -(set -x; gcloud compute instances create \ - --quiet \ - --image-project "${IMAGE_PROJECT}" \ - --image-family "${IMAGE_FAMILY}" \ - --boot-disk-size "200GB" \ - --zone "${ZONE}" \ - "${INSTANCE_NAME}" >/dev/null) -function cleanup { - (set -x; gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}") -} -trap cleanup EXIT - -# Wait for the instance to become available (up to 5 minutes). -echo -n "Waiting for ${INSTANCE_NAME}" >&2 -declare timeout=300 -declare success=0 -declare internal="" -declare -r start=$(date +%s) -declare -r end=$((${start}+${timeout})) -while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do - echo -n "." >&2 - if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then - success=$((${success}+1)) - elif gcloud compute ssh --internal-ip --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then - success=$((${success}+1)) - internal="--internal-ip" - fi -done - -if [[ "${success}" -eq "0" ]]; then - echo "connect timed out after ${timeout} seconds." >&2 - exit 1 -else - echo "done." >&2 -fi - -# Run the install scripts provided. -for arg; do - (set -x; gcloud compute ssh ${internal} \ - --zone "${ZONE}" \ - "${USERNAME}"@"${INSTANCE_NAME}" -- \ - "${SSH_ARGS[@]}" \ - sudo bash - <"${arg}" >/dev/null) -done - -# Stop the instance; required before creating an image. -(set -x; gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null) - -# Create a snapshot of the instance disk. -(set -x; gcloud compute disks snapshot \ - --quiet \ - --zone "${ZONE}" \ - --snapshot-names="${SNAPSHOT_NAME}" \ - "${INSTANCE_NAME}" >/dev/null) - -# Create the disk image. -(set -x; gcloud compute images create \ - --quiet \ - --source-snapshot="${SNAPSHOT_NAME}" \ - --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \ - "${IMAGE_NAME}" >/dev/null) - -# Finish up. -echo "${IMAGE_NAME}" diff --git a/tools/vm/defs.bzl b/tools/vm/defs.bzl deleted file mode 100644 index 9af5ad3b4..000000000 --- a/tools/vm/defs.bzl +++ /dev/null @@ -1,202 +0,0 @@ -"""Image configuration. See README.md.""" - -load("//tools:defs.bzl", "default_installer") - -# vm_image_builder is a rule that will construct a shell script that actually -# generates a given VM image. Note that this does not _run_ the shell script -# (although it can be run manually). It will be run manually during generation -# of the vm_image target itself. This level of indirection is used so that the -# build system itself only runs the builder once when multiple targets depend -# on it, avoiding a set of races and conflicts. -def _vm_image_builder_impl(ctx): - # Generate a binary that actually builds the image. - builder = ctx.actions.declare_file(ctx.label.name) - script_paths = [] - for script in ctx.files.scripts: - script_paths.append(script.short_path) - builder_content = "\n".join([ - "#!/bin/bash", - "export ZONE=$(%s)" % ctx.files.zone[0].short_path, - "export USERNAME=%s" % ctx.attr.username, - "export IMAGE_PROJECT=%s" % ctx.attr.project, - "export IMAGE_FAMILY=%s" % ctx.attr.family, - "%s %s" % (ctx.files._builder[0].short_path, " ".join(script_paths)), - "", - ]) - ctx.actions.write(builder, builder_content, is_executable = True) - - # Note that the scripts should only be files, and should not include any - # indirect transitive dependencies. The build script wouldn't work. - return [DefaultInfo( - executable = builder, - runfiles = ctx.runfiles( - files = ctx.files.scripts + ctx.files._builder + ctx.files.zone, - ), - )] - -vm_image_builder = rule( - attrs = { - "_builder": attr.label( - executable = True, - default = "//tools/vm:builder", - cfg = "host", - ), - "username": attr.string(default = "$(whoami)"), - "zone": attr.label( - executable = True, - default = "//tools/vm:zone", - cfg = "host", - ), - "family": attr.string(mandatory = True), - "project": attr.string(mandatory = True), - "scripts": attr.label_list(allow_files = True), - }, - executable = True, - implementation = _vm_image_builder_impl, -) - -# See vm_image_builder above. -def _vm_image_impl(ctx): - # Run the builder to generate our output. - echo = ctx.actions.declare_file(ctx.label.name) - resolved_inputs, argv, runfiles_manifests = ctx.resolve_command( - command = "\n".join([ - "set -e", - "image=$(%s)" % ctx.files.builder[0].path, - "echo -ne \"#!/bin/bash\\necho ${image}\\n\" > %s" % echo.path, - "chmod 0755 %s" % echo.path, - ]), - tools = [ctx.attr.builder], - ) - ctx.actions.run_shell( - tools = resolved_inputs, - outputs = [echo], - progress_message = "Building image...", - execution_requirements = {"local": "true"}, - command = argv, - input_manifests = runfiles_manifests, - ) - - # Return just the echo command. All of the builder runfiles have been - # resolved and consumed in the generation of the trivial echo script. - return [DefaultInfo(executable = echo)] - -_vm_image_test = rule( - attrs = { - "builder": attr.label( - executable = True, - cfg = "host", - ), - }, - test = True, - implementation = _vm_image_impl, -) - -def vm_image(name, **kwargs): - vm_image_builder( - name = name + "_builder", - **kwargs - ) - _vm_image_test( - name = name, - builder = ":" + name + "_builder", - tags = [ - "local", - "manual", - ], - ) - -def _vm_test_impl(ctx): - runner = ctx.actions.declare_file("%s-executer" % ctx.label.name) - - # Note that the remote execution case must actually generate an - # intermediate target in order to collect all the relevant runfiles so that - # they can be copied over for remote execution. - runner_content = "\n".join([ - "#!/bin/bash", - "export ZONE=$(%s)" % ctx.files.zone[0].short_path, - "export USERNAME=%s" % ctx.attr.username, - "export IMAGE=$(%s)" % ctx.files.image[0].short_path, - "export SUDO=%s" % "true" if ctx.attr.sudo else "false", - "%s %s" % ( - ctx.executable.executer.short_path, - " ".join([ - target.files_to_run.executable.short_path - for target in ctx.attr.targets - ]), - ), - "", - ]) - ctx.actions.write(runner, runner_content, is_executable = True) - - # Return with all transitive files. - runfiles = ctx.runfiles( - transitive_files = depset(transitive = [ - depset(target.data_runfiles.files) - for target in ctx.attr.targets - if hasattr(target, "data_runfiles") - ]), - files = ctx.files.executer + ctx.files.zone + ctx.files.image + - ctx.files.targets, - collect_default = True, - collect_data = True, - ) - return [DefaultInfo(executable = runner, runfiles = runfiles)] - -_vm_test = rule( - attrs = { - "image": attr.label( - executable = True, - default = "//tools/vm:ubuntu1804", - cfg = "host", - ), - "executer": attr.label( - executable = True, - default = "//tools/vm:executer", - cfg = "host", - ), - "username": attr.string(default = "$(whoami)"), - "zone": attr.label( - executable = True, - default = "//tools/vm:zone", - cfg = "host", - ), - "sudo": attr.bool(default = True), - "machine": attr.string(default = "n1-standard-1"), - "targets": attr.label_list( - mandatory = True, - allow_empty = False, - cfg = "target", - ), - }, - test = True, - implementation = _vm_test_impl, -) - -def vm_test( - installers = None, - **kwargs): - """Runs the given targets as a remote test. - - Args: - installer: Script to run before all targets. - **kwargs: All test arguments. Should include targets and image. - """ - targets = kwargs.pop("targets", []) - if installers == None: - installers = [ - "//tools/installers:head", - "//tools/installers:images", - ] - targets = installers + targets - if default_installer(): - targets = [default_installer()] + targets - _vm_test( - tags = [ - "local", - "manual", - ], - targets = targets, - local = 1, - **kwargs - ) diff --git a/tools/vm/execute.sh b/tools/vm/execute.sh deleted file mode 100755 index 1f1f3ce01..000000000 --- a/tools/vm/execute.sh +++ /dev/null @@ -1,160 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -xeo pipefail - -# Required input. -if ! [[ -v IMAGE ]]; then - echo "no image provided: set IMAGE." - exit 1 -fi - -# Parameters. -declare -r USERNAME=${USERNAME:-test} -declare -r KEYNAME=$(mktemp --tmpdir -u key-XXXXXX) -declare -r SSHKEYS=$(mktemp --tmpdir -u sshkeys-XXXXXX) -declare -r INSTANCE_NAME=$(mktemp -u test-XXXXXX | tr A-Z a-z) -declare -r MACHINE=${MACHINE:-n1-standard-1} -declare -r ZONE=${ZONE:-us-central1-f} -declare -r SUDO=${SUDO:-false} - -# Standard arguments (applies only on script execution). -declare -ar SSH_ARGS=("-o" "ConnectTimeout=60" "--") - -# This script is executed as a test rule, which will reset the value of HOME. -# Unfortunately, it is needed to load the gconfig credentials. We will reset -# HOME when we actually execute in the remote environment, defined below. -export HOME=$(eval echo ~$(whoami)) - -# Generate unique keys for this test. -[[ -f "${KEYNAME}" ]] || ssh-keygen -t rsa -N "" -f "${KEYNAME}" -C "${USERNAME}" -cat > "${SSHKEYS}" <<EOF -${USERNAME}:$(cat ${KEYNAME}.pub) -EOF - -# Start a unique instance. This means that we first generate a unique set of ssh -# keys to ensure that only we have access to this instance. Note that we must -# constrain ourselves to Haswell or greater in order to have nested -# virtualization available. -gcloud compute instances create \ - --min-cpu-platform "Intel Haswell" \ - --preemptible \ - --no-scopes \ - --metadata block-project-ssh-keys=TRUE \ - --metadata-from-file ssh-keys="${SSHKEYS}" \ - --machine-type "${MACHINE}" \ - --image "${IMAGE}" \ - --zone "${ZONE}" \ - "${INSTANCE_NAME}" -function cleanup { - gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}" -} -trap cleanup EXIT - -# Wait for the instance to become available (up to 5 minutes). -declare timeout=300 -declare success=0 -declare -r start=$(date +%s) -declare -r end=$((${start}+${timeout})) -while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do - if gcloud compute ssh --ssh-key-file="${KEYNAME}" --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then - success=$((${success}+1)) - fi -done -if [[ "${success}" -eq "0" ]]; then - echo "connect timed out after ${timeout} seconds." - exit 1 -fi - -# Copy the local directory over. -tar czf - --dereference --exclude=.git . | - gcloud compute ssh \ - --ssh-key-file="${KEYNAME}" \ - --zone "${ZONE}" \ - "${USERNAME}"@"${INSTANCE_NAME}" -- \ - "${SSH_ARGS[@]}" \ - tar xzf - - -# Execute the command remotely. -for cmd; do - # Setup relevant environment. - # - # N.B. This is not a complete test environment, but is complete enough to - # provide rudimentary sharding and test output support. - declare -a PREFIX=( "env" ) - if [[ -v TEST_SHARD_INDEX ]]; then - PREFIX+=( "TEST_SHARD_INDEX=${TEST_SHARD_INDEX}" ) - fi - if [[ -v TEST_SHARD_STATUS_FILE ]]; then - SHARD_STATUS_FILE=$(mktemp -u test-shard-status-XXXXXX) - PREFIX+=( "TEST_SHARD_STATUS_FILE=/tmp/${SHARD_STATUS_FILE}" ) - fi - if [[ -v TEST_TOTAL_SHARDS ]]; then - PREFIX+=( "TEST_TOTAL_SHARDS=${TEST_TOTAL_SHARDS}" ) - fi - if [[ -v TEST_TMPDIR ]]; then - REMOTE_TMPDIR=$(mktemp -u test-XXXXXX) - PREFIX+=( "TEST_TMPDIR=/tmp/${REMOTE_TMPDIR}" ) - # Create remotely. - gcloud compute ssh \ - --ssh-key-file="${KEYNAME}" \ - --zone "${ZONE}" \ - "${USERNAME}"@"${INSTANCE_NAME}" -- \ - "${SSH_ARGS[@]}" \ - mkdir -p "/tmp/${REMOTE_TMPDIR}" - fi - if [[ -v XML_OUTPUT_FILE ]]; then - TEST_XML_OUTPUT=$(mktemp -u xml-output-XXXXXX) - PREFIX+=( "XML_OUTPUT_FILE=/tmp/${TEST_XML_OUTPUT}" ) - fi - if [[ "${SUDO}" == "true" ]]; then - PREFIX+=( "sudo" "-E" ) - fi - - # Execute the command. - gcloud compute ssh \ - --ssh-key-file="${KEYNAME}" \ - --zone "${ZONE}" \ - "${USERNAME}"@"${INSTANCE_NAME}" -- \ - "${SSH_ARGS[@]}" \ - "${PREFIX[@]}" "${cmd}" - - # Collect relevant results. - if [[ -v TEST_SHARD_STATUS_FILE ]]; then - gcloud compute scp \ - --ssh-key-file="${KEYNAME}" \ - --zone "${ZONE}" \ - "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${SHARD_STATUS_FILE}" \ - "${TEST_SHARD_STATUS_FILE}" 2>/dev/null || true # Allowed to fail. - fi - if [[ -v XML_OUTPUT_FILE ]]; then - gcloud compute scp \ - --ssh-key-file="${KEYNAME}" \ - --zone "${ZONE}" \ - "${USERNAME}"@"${INSTANCE_NAME}":/tmp/"${TEST_XML_OUTPUT}" \ - "${XML_OUTPUT_FILE}" 2>/dev/null || true # Allowed to fail. - fi - - # Clean up the temporary directory. - if [[ -v TEST_TMPDIR ]]; then - gcloud compute ssh \ - --ssh-key-file="${KEYNAME}" \ - --zone "${ZONE}" \ - "${USERNAME}"@"${INSTANCE_NAME}" -- \ - "${SSH_ARGS[@]}" \ - rm -rf "/tmp/${REMOTE_TMPDIR}" - fi -done diff --git a/tools/vm/ubuntu1604/10_core.sh b/tools/vm/ubuntu1604/10_core.sh deleted file mode 100755 index 629f7cf7a..000000000 --- a/tools/vm/ubuntu1604/10_core.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -xeo pipefail - -# Install all essential build tools. -while true; do - if (apt-get update && apt-get install -y \ - make \ - git-core \ - build-essential \ - linux-headers-$(uname -r) \ - pkg-config); then - break - fi - result=$? - if [[ $result -ne 100 ]]; then - exit $result - fi -done - -# Install a recent go toolchain. -if ! [[ -d /usr/local/go ]]; then - wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz - tar -xvf go1.13.5.linux-amd64.tar.gz - mv go /usr/local -fi - -# Link the Go binary from /usr/bin; replacing anything there. -(cd /usr/bin && rm -f go && ln -fs /usr/local/go/bin/go go) diff --git a/tools/vm/ubuntu1604/15_gcloud.sh b/tools/vm/ubuntu1604/15_gcloud.sh deleted file mode 100755 index bc2e5eccc..000000000 --- a/tools/vm/ubuntu1604/15_gcloud.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -xeo pipefail - -# Install all essential build tools. -while true; do - if (apt-get update && apt-get install -y \ - apt-transport-https \ - ca-certificates \ - gnupg); then - break - fi - result=$? - if [[ $result -ne 100 ]]; then - exit $result - fi -done - -# Add gcloud repositories. -echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \ - tee -a /etc/apt/sources.list.d/google-cloud-sdk.list - -# Add the appropriate key. -curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ - apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - - -# Install the gcloud SDK. -while true; do - if (apt-get update && apt-get install -y google-cloud-sdk); then - break - fi - result=$? - if [[ $result -ne 100 ]]; then - exit $result - fi -done diff --git a/tools/vm/ubuntu1604/20_bazel.sh b/tools/vm/ubuntu1604/20_bazel.sh deleted file mode 100755 index bb7afa676..000000000 --- a/tools/vm/ubuntu1604/20_bazel.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -xeo pipefail - -declare -r BAZEL_VERSION=2.0.0 - -# Install bazel dependencies. -while true; do - if (apt-get update && apt-get install -y \ - openjdk-8-jdk-headless \ - unzip); then - break - fi - result=$? - if [[ $result -ne 100 ]]; then - exit $result - fi -done - -# Use the release installer. -curl -L -o bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh -chmod a+x bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh -./bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh -rm -f bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh diff --git a/tools/vm/ubuntu1604/30_docker.sh b/tools/vm/ubuntu1604/30_docker.sh deleted file mode 100755 index d393133e4..000000000 --- a/tools/vm/ubuntu1604/30_docker.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Add dependencies. -while true; do - if (apt-get update && apt-get install -y \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg-agent \ - software-properties-common); then - break - fi - result=$? - if [[ $result -ne 100 ]]; then - exit $result - fi -done - -# Install the key. -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - - -# Add the repository. -add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" - -# Install docker. -while true; do - if (apt-get update && apt-get install -y \ - docker-ce \ - docker-ce-cli \ - containerd.io); then - break - fi - result=$? - if [[ $result -ne 100 ]]; then - exit $result - fi -done - -# Enable experimental features, for cross-building aarch64 images. -# Enable Docker IPv6. -cat > /etc/docker/daemon.json <<EOF -{ - "experimental": true, - "fixed-cidr-v6": "2001:db8:1::/64", - "ipv6": true -} -EOF diff --git a/tools/vm/ubuntu1604/40_kokoro.sh b/tools/vm/ubuntu1604/40_kokoro.sh deleted file mode 100755 index d3b96c9ad..000000000 --- a/tools/vm/ubuntu1604/40_kokoro.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -xeo pipefail - -# Declare kokoro's required public keys. -declare -r ssh_public_keys=( - "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDg7L/ZaEauETWrPklUTky3kvxqQfe2Ax/2CsSqhNIGNMnK/8d79CHlmY9+dE1FFQ/RzKNCaltgy7XcN/fCYiCZr5jm2ZtnLuGNOTzupMNhaYiPL419qmL+5rZXt4/dWTrsHbFRACxT8j51PcRMO5wgbL0Bg2XXimbx8kDFaurL2gqduQYqlu4lxWCaJqOL71WogcimeL63Nq/yeH5PJPWpqE4P9VUQSwAzBWFK/hLeds/AiP3MgVS65qHBnhq0JsHy8JQsqjZbG7Iidt/Ll0+gqzEbi62gDIcczG4KC0iOVzDDP/1BxDtt1lKeA23ll769Fcm3rJyoBMYxjvdw1TDx sabujp@trigger.mtv.corp.google.com" - "ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBNgGK/hCdjmulHfRE3hp4rZs38NCR8yAh0eDsztxqGcuXnuSnL7jOlRrbcQpremJ84omD4eKrIpwJUs+YokMdv4= sabujp@trigger.svl.corp.google.com" -) - -# Install dependencies. -while true; do - if (apt-get update && apt-get install -y \ - rsync \ - coreutils \ - python-psutil \ - qemu-kvm \ - python-pip \ - python3-pip \ - zip); then - break - fi - result=$? - if [[ $result -ne 100 ]]; then - exit $result - fi -done - -# junitparser is used to merge junit xml files. -pip install --no-cache-dir junitparser - -# We need a kbuilder user, which may already exist. -useradd -c "kbuilder user" -m -s /bin/bash kbuilder || true - -# We need to provision appropriate keys. -mkdir -p ~kbuilder/.ssh -(IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys -chmod 0600 ~kbuilder/.ssh/authorized_keys -chown -R kbuilder ~kbuilder/.ssh - -# Give passwordless sudo access. -cat > /etc/sudoers.d/kokoro <<EOF -kbuilder ALL=(ALL) NOPASSWD:ALL -EOF - -# Ensure we can run Docker without sudo. -usermod -aG docker kbuilder - -# Ensure that we can access kvm. -usermod -aG kvm kbuilder - -# Ensure that /tmpfs exists and is writable by kokoro. -# -# Note that kokoro will typically attach a second disk (sdb) to the instance -# that is used for the /tmpfs volume. In the future we could setup an init -# script that formats and mounts this here; however, we don't expect our build -# artifacts to be that large. -mkdir -p /tmpfs && chmod 0777 /tmpfs && touch /tmpfs/READY diff --git a/tools/vm/ubuntu1604/BUILD b/tools/vm/ubuntu1604/BUILD deleted file mode 100644 index ab1df0c4c..000000000 --- a/tools/vm/ubuntu1604/BUILD +++ /dev/null @@ -1,7 +0,0 @@ -package(licenses = ["notice"]) - -filegroup( - name = "ubuntu1604", - srcs = glob(["*.sh"]), - visibility = ["//:sandbox"], -) diff --git a/tools/vm/ubuntu1804/BUILD b/tools/vm/ubuntu1804/BUILD deleted file mode 100644 index 0c8856dde..000000000 --- a/tools/vm/ubuntu1804/BUILD +++ /dev/null @@ -1,7 +0,0 @@ -package(licenses = ["notice"]) - -alias( - name = "ubuntu1804", - actual = "//tools/vm/ubuntu1604", - visibility = ["//:sandbox"], -) diff --git a/website/blog/README.md b/website/blog/README.md new file mode 100644 index 000000000..e1d685288 --- /dev/null +++ b/website/blog/README.md @@ -0,0 +1,62 @@ +# gVisor blog + +The gVisor blog is owned and run by the gVisor team. + +## Contact + +Reach out to us on [gitter](https://gitter.im/gvisor/community) or the +[mailing list](https://groups.google.com/forum/#!forum/gvisor-users) if you +would like to write a blog post. + +## Submit a Post + +Anyone can write a blog post and submit it for review. Purely commercial content +or vendor pitches are not allowed. Please refer to the +[blog guidelines](#blog-guidelines) for more guidance about content is that +allowed. + +To submit a blog post, follow the steps below. + +1. [Sign the Contributor License Agreements](https://gvisor.dev/contributing/) + if you have not yet done so. +1. Familiarize yourself with the Markdown format for the + [existing blog posts](https://github.com/google/gvisor/tree/master/website/blog). +1. Write your blog post in a text editor of your choice. +1. (Optional) If you need help with markdown, check out + [StakEdit](https://stackedit.io/app#) or read + [Jekyll's formatting reference](https://jekyllrb.com/docs/posts/#creating-posts) + for more information. +1. Click **Add file** > **Create new file**. +1. Paste your content into the editor and save it. Name the file in the + following way: *[BLOG] Your proposed title* , but don’t put the date in the + file name. The blog reviewers will work with you on the final file name, and + the date on which the blog will be published. +1. When you save the file, GitHub will walk you through the pull request (PR) + process. +1. Send us a message on [gitter](https://gitter.im/gvisor/community) with a + link to your recently created PR. +1. A reviewer will be assigned to the pull request. They check your submission, + and work with you on feedback and final details. When the pull request is + approved, the blog will be scheduled for publication. + +### Blog Guidelines {#blog-guidelines} + +#### Suitable content: + +- **Original content only** +- gVisor features or project updates +- Tutorials and demos +- Use cases +- Content that is specific to a vendor or platform about gVisor installation + and use + +#### Unsuitable Content: + +- Blogs with no content relevant to gVisor +- Vendor pitches + +## Review Process + +Each blog post should be approved by at least one person on the team. Once all +of the review comments have been addressed and approved, a member of the team +will schedule publication of the blog post. diff --git a/website/blog/index.html b/website/blog/index.html index 5c67c95fc..272917fc4 100644 --- a/website/blog/index.html +++ b/website/blog/index.html @@ -20,3 +20,8 @@ pagination: {% if paginator.total_pages > 1 %} {% include paginator.html %} {% endif %} + +<hr> + +If you would like to contribute to the gVisor blog check out the +<a href="https://github.com/google/gvisor/tree/master/website/blog">instructions</a>. |