diff options
1225 files changed, 34027 insertions, 23661 deletions
diff --git a/.buildkite/hooks/post-command b/.buildkite/hooks/post-command index c4c6fc90c..5e6db407a 100644 --- a/.buildkite/hooks/post-command +++ b/.buildkite/hooks/post-command @@ -56,8 +56,10 @@ if test "${BUILDKITE_COMMAND_EXIT_STATUS}" -ne "0"; then sudo rm -rf "${HOME}/go" fi -# Kill any running containers (clear state). -CONTAINERS="$(docker ps -q)" -if ! test -z "${CONTAINERS}"; then - docker container kill ${CONTAINERS} 2>/dev/null || true -fi +# Kill any running containers (clear state), except for "bootstrap". +for container in $(docker ps -q); do + maybe_kill="$(docker inspect -f '{{if ne "/bootstrap" .Name}}true{{ end }}' "${container}")" + if test -n "${maybe_kill}"; then + docker container kill "${container}" + fi +done
\ No newline at end of file diff --git a/.buildkite/hooks/pre-command b/.buildkite/hooks/pre-command index fb2b1892d..9eb6d38f4 100644 --- a/.buildkite/hooks/pre-command +++ b/.buildkite/hooks/pre-command @@ -8,7 +8,7 @@ function install_pkgs() { done } install_pkgs make "linux-headers-$(uname -r)" linux-libc-dev \ - graphviz jq curl binutils gnupg gnupg-agent golang-go \ + graphviz jq curl binutils gnupg gnupg-agent gcc pkg-config \ apt-transport-https ca-certificates software-properties-common # Setup for parallelization with PARTITION and TOTAL_PARTITIONS. diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index c1b478dc3..adff70fee 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -169,6 +169,11 @@ steps: label: ":mechanical_arm: ARM" command: make arm-qemu-smoke-test + # Build everything. + - <<: *common + label: ":world_map: Build everything" + command: "make build OPTIONS=--build_tag_filters=-nogo TARGETS=//..." + # Run basic benchmarks smoke tests (no upload). - <<: *common label: ":fire: Benchmarks smoke test" diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index f7a6112fd..000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -name: Bug report -about: Create a bug report to help us improve -title: -labels: 'type: bug' -assignees: '' ---- - -**Description** - -A clear description of what the bug is. If possible, explicitly indicate the -expected behavior vs. the observed behavior. - -**Steps to reproduce** - -If available, please include detailed reproduction steps. - -If the bug requires software that is not publicly available, see if it can be -reproduced with software that is publicly available. - -**Environment** - -Please include the following details of your environment: - -* `runsc -version` -* `docker version` or `docker info` (if available) -* `kubectl version` and `kubectl get nodes` (if using Kubernetes) -* `uname -a` -* `git describe` (if built from source) -* `runsc` debug logs (if available) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 000000000..f096ad598 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,67 @@ +name: Bug report +description: Create a bug report to help us improve +labels: + - 'type: bug' +body: + - type: textarea + id: description + attributes: + label: Description + description: > + A clear description of the bug. If possible, explicitly indicate the + expected behavior vs. the observed behavior. + placeholder: Describe the problem. + validations: + required: true + - type: textarea + id: repro + attributes: + label: Steps to reproduce + description: > + If available, please include detailed reproduction steps. + + If the bug requires software that is not publicly available, see if it + can be reproduced with software that is publicly available. + placeholder: How can others reproduce the issue? + - type: markdown + attributes: + value: | + # Environment + + Please include the following details of your environment. + - type: textarea + id: runscVersion + attributes: + label: "runsc version" + placeholder: "`runsc -version`" + render: shell + - type: textarea + id: docker + attributes: + label: "docker version (if using docker)" + placeholder: "`docker version` or `docker info`" + render: shell + - type: input + id: uname + attributes: + label: "uname" + placeholder: "`uname -a`" + - type: textarea + id: kubectl + attributes: + label: "kubectl (if using Kubernetes)" + placeholder: "`kubectl version` and `kubectl get nodes`" + render: shell + - type: input + id: gitDescribe + attributes: + label: "repo state (if built from source)" + placeholder: "`git describe`" + - type: textarea + id: runscLogs + attributes: + label: "runsc debug logs (if available)" + description: > + See the [debug guide](https://gvisor.dev/docs/user_guide/debugging/) + to learn about logging. + render: shell diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 9acc45574..000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: Feature request -about: Suggest an idea or improvement -title: '' -labels: 'type: enhancement' -assignees: '' ---- - -**Description** - -A clear description of the feature or enhancement. - -**Is this feature related to a specific bug?** - -Please include a bug references if yes. - -**Do you have a specific solution in mind?** - -Please include any details about a solution that you have in mind, including any -alternatives considered. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 000000000..5073aba8b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,24 @@ +name: Feature request +description: Suggest an idea or improvement +labels: + - 'type: enhancement' +body: + - type: textarea + id: description + attributes: + label: Description + placeholder: A clear description of the feature or enhancement. + validations: + required: true + - type: textarea + id: related + attributes: + label: Is this feature related to a specific bug? + description: Please include a bug references if yes. + - type: textarea + id: solution + attributes: + label: Do you have a specific solution in mind? + description: > + Please include any details about a solution that you have in mind, + including any alternatives considered. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c53df7d25..e680428c3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -32,8 +32,9 @@ will need to be added to the appropriate `BUILD` files, and the `:gopath` target will need to be re-run to generate appropriate symlinks in the `GOPATH` directory tree. -Dependencies can be added by using `go mod get`. In order to keep the -`WORKSPACE` file in sync, run `tools/go_mod.sh` in place of `go mod`. +Dependencies can be added by using `go get`. In order to keep the `WORKSPACE` +file in sync, run `bazel run //:gazelle -- update-repos -from_file=go.mod` in +place of `go mod`. ### Coding Guidelines @@ -65,10 +66,14 @@ Rules: * Itself. * Go standard library. - * Except (transitively) package "net" (this will result in a non-cgo - binary). Use `//pkg/unet` instead. + * Except (transitively) package "net", which would result in a cgo + binary. Use `//pkg/unet` instead. * `@org_golang_x_sys//unix:go_default_library` (Go import `golang.org/x/sys/unix`). + * `@org_golang_x_time//rate:go_default_library` (Go import + `golang.org/x/time/rate`). + * `@com_github_google_btree//:go_default_library"` (Go import + `github.com/google/btree`). * Generated Go protobuf packages. * `@org_golang_google_protobuf//proto:go_default_library` (Go import `google.golang.org/protobuf`). @@ -670,13 +670,6 @@ go_repository( ) go_repository( - name = "com_github_jonboulle_clockwork", - importpath = "github.com/jonboulle/clockwork", - sum = "h1:VKV+ZcuP6l3yW9doeqz6ziZGgcynBVQO+obU0+0hcPo=", - version = "v0.1.0", -) - -go_repository( name = "com_github_jtolds_gls", importpath = "github.com/jtolds/gls", sum = "h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=", @@ -1245,8 +1238,8 @@ rbe_autoconfig(name = "rbe_default") http_archive( name = "rules_pkg", - sha256 = "6b5969a7acd7b60c02f816773b06fcf32fbe8ba0c7919ccdc2df4f8fb923804a", - url = "https://github.com/bazelbuild/rules_pkg/releases/download/0.3.0/rules_pkg-0.3.0.tar.gz", + sha256 = "353b20e8b093d42dd16889c7f918750fb8701c485ac6cceb69a5236500507c27", + url = "https://github.com/bazelbuild/rules_pkg/releases/download/0.5.0/rules_pkg-0.5.0.tar.gz", ) load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") @@ -1254,7 +1247,7 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") rules_pkg_dependencies() # System Call test dependencies. -# grpc also has a dependency on abseil but as this is before grpc dependency +# grpc also has a dependency on abseil but as this is before grpc dependency # declaration, it will take precedence over grpc's one # Version LTS 20210324.2 http_archive( @@ -1306,6 +1299,15 @@ http_archive( ], ) +http_archive( + name = "com_google_protobuf", + sha256 = "528927e398f4e290001886894dac17c5c6a2e5548f3fb68004cfb01af901b53a", + strip_prefix = "protobuf-3.17.3", + urls = ["https://github.com/protocolbuffers/protobuf/archive/v3.17.3.zip"], +) +load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") +protobuf_deps() + # Schemas for testing. http_file( name = "buildkite_pipeline_schema", @@ -1631,9 +1633,9 @@ go_repository( go_repository( name = "co_honnef_go_tools", importpath = "honnef.co/go/tools", - sum = "h1:EVDuO03OCZwpV2t/tLLxPmPiomagMoBOgfPt0FM+4IY=", - version = "v0.1.1", -) + sum = "h1:Tyybiul3hjaq0dkv+kcf5/MPTfo+ZBiEWrkhgxMPH54=", + version = "v0.3.0-0.dev.0.20210801021341-453cb28c0b15", + ) go_repository( name = "com_github_burntsushi_toml", diff --git a/debian/BUILD b/debian/BUILD index 32cc209bf..a62164cb0 100644 --- a/debian/BUILD +++ b/debian/BUILD @@ -28,12 +28,10 @@ pkg_deb( amd64 = "amd64", arm64 = "arm64", ), - changes = "runsc.changes", conffiles = [ "/etc/containerd/runsc.toml", ], data = ":debian-data", - deb = "runsc.deb", # Note that the description_file will be flatten (all newlines removed), # and therefore it is kept to a simple one-line description. The expected # format for debian packages is "short summary\nLonger explanation of @@ -42,6 +40,7 @@ pkg_deb( homepage = "https://gvisor.dev/", maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>", package = "runsc", + package_file_name = "runsc.deb", postinst = "postinst.sh", version_file = version, visibility = [ diff --git a/debian/postinst.sh b/debian/postinst.sh index 6a326f823..b387b9f22 100755 --- a/debian/postinst.sh +++ b/debian/postinst.sh @@ -22,7 +22,7 @@ fi if [ -f /etc/docker/daemon.json ]; then runsc install if systemctl is-active -q docker; then - systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2 + systemctl reload docker || echo "unable to reload docker; you must do so manually." >&2 fi fi diff --git a/g3doc/user_guide/compatibility.md b/g3doc/user_guide/compatibility.md index 9d3e3680f..76e879a01 100644 --- a/g3doc/user_guide/compatibility.md +++ b/g3doc/user_guide/compatibility.md @@ -49,43 +49,44 @@ Most common utilities work. Note that: <!-- mdformat off(don't wrap the table) --> -| Tool | Status | -|:--------:|:-----------------------------------------:| -| apt-get | Working. | -| bundle | Working. | -| cat | Working. | -| curl | Working. | -| dd | Working. | -| df | Working. | -| dig | Working. | -| drill | Working. | -| env | Working. | -| find | Working. | -| gdb | Working. | -| gosu | Working. | -| grep | Working (unless stdin is a pipe and stdout is /dev/null). | -| ifconfig | Works partially, like ip. Full support [in progress](https://gvisor.dev/issue/578). | -| ip | Some subcommands work (e.g. addr, route). Full support [in progress](https://gvisor.dev/issue/578). | -| less | Working. | -| ls | Working. | -| lsof | Working. | -| mount | Works in readonly mode. gVisor doesn't currently support creating new mounts at runtime. | -| nc | Working. | -| nmap | Not working. | -| netstat | [In progress](https://gvisor.dev/issue/2112). | -| nslookup | Working. | -| ping | Working. | -| ps | Working. | -| route | Working. | -| ss | [In progress](https://gvisor.dev/issue/2114). | -| sshd | Partially working. Job control [in progress](https://gvisor.dev/issue/154). | -| strace | Working. | -| tar | Working. | -| tcpdump | [In progress](https://gvisor.dev/issue/173). | -| top | Working. | -| uptime | Working. | -| vim | Working. | -| wget | Working. | +| Tool | Status | +| :--------: | :-----------------------------------------: | +| apt-get | Working. | +| bundle | Working. | +| cat | Working. | +| curl | Working. | +| dd | Working. | +| df | Working. | +| dig | Working. | +| drill | Working. | +| env | Working. | +| find | Working. | +| gcore | Working. | +| gdb | Working. | +| gosu | Working. | +| grep | Working (unless stdin is a pipe and stdout is /dev/null). | +| ifconfig | Works partially, like ip. Full support [in progress](https://gvisor.dev/issue/578). | +| ip | Some subcommands work (e.g. addr, route). Full support [in progress](https://gvisor.dev/issue/578). | +| less | Working. | +| ls | Working. | +| lsof | Working. | +| mount | Works in readonly mode. gVisor doesn't currently support creating new mounts at runtime. | +| nc | Working. | +| nmap | Not working. | +| netstat | [In progress](https://gvisor.dev/issue/2112). | +| nslookup | Working. | +| ping | Working. | +| ps | Working. | +| route | Working. | +| ss | [In progress](https://gvisor.dev/issue/2114). | +| sshd | Partially working. Job control [in progress](https://gvisor.dev/issue/154). | +| strace | Working. | +| tar | Working. | +| tcpdump | Working. [Promiscuous mode in progress](https://gvisor.dev/issue/3333). | +| top | Working. | +| uptime | Working. | +| vim | Working. | +| wget | Working. | <!-- mdformat on --> diff --git a/g3doc/user_guide/debugging.md b/g3doc/user_guide/debugging.md index 54fdce34f..2291b5fab 100644 --- a/g3doc/user_guide/debugging.md +++ b/g3doc/user_guide/debugging.md @@ -61,24 +61,39 @@ You can debug gVisor like any other Golang program. If you're running with Docker, you'll need to find the sandbox PID and attach the debugger as root. Here is an example: +Install a runsc with debug symbols (you can also use the +[nightly release](../install/#nightly)): + ```bash -# Get a runsc with debug symbols (download nightly or build with symbols). -bazel build -c dbg //runsc:runsc +make dev BAZEL_OPTIONS="-c dbg" +``` -# Start the container you want to debug. -docker run --runtime=runsc --rm --name=test -d alpine sleep 1000 +Start the container you want to debug using the runsc runtime with debug +options: -# Find the sandbox PID. -docker inspect test | grep Pid | head -n 1 +```bash +docker run --runtime=$(git branch --show-current)-d --rm --name=test -p 8080:80 -d nginx +``` -# Attach your favorite debugger. -sudo dlv attach <PID> +Find the PID and attach your favorite debugger: + +```bash +sudo dlv attach $(docker inspect test | grep Pid | head -n 1 | grep -oe "[0-9]*") +``` -# Set a breakpoint and resume. -break mm.MemoryManager.MMap +Set a breakpoint for accept: + +```bash +break gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).Accept continue ``` +In a different window connect to nginx to trigger the breakpoint: + +```bash +curl http://localhost:8080/ +``` + ## Profiling `runsc` integrates with Go profiling tools and gives you easy commands to diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md index 321f13ce8..85ba6a161 100644 --- a/g3doc/user_guide/install.md +++ b/g3doc/user_guide/install.md @@ -55,7 +55,10 @@ sudo apt-get install -y \ software-properties-common ``` -Next, the configure the key used to sign archives and the repository: +Next, configure the key used to sign archives and the repository. + +NOTE: The key was updated on 2021-07-13 to replace the expired key. If you get +errors about the key being expired, run the `apt-key add` command below again. ```bash curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add - diff --git a/g3doc/user_guide/networking.md b/g3doc/user_guide/networking.md index 95f675633..75f01aac5 100644 --- a/g3doc/user_guide/networking.md +++ b/g3doc/user_guide/networking.md @@ -61,7 +61,7 @@ Add the following `runtimeArgs` to your Docker configuration ### Disable GSO {#gso} -If your Linux is older than 4.14.17, you can disable Generic Segmentation +If your Linux is older than 4.14.77, you can disable Generic Segmentation Offload (GSO) to run with a kernel that is newer than 3.17. Add the `--gso=false` flag to your Docker runtime configuration (`/etc/docker/daemon.json`) and restart the Docker daemon: diff --git a/images/default/Dockerfile b/images/default/Dockerfile index 5f652f2c3..4384d6271 100644 --- a/images/default/Dockerfile +++ b/images/default/Dockerfile @@ -15,7 +15,7 @@ RUN add-apt-repository \ "deb https://download.docker.com/linux/ubuntu \ $(lsb_release -cs) \ stable" -RUN apt-get install docker-ce-cli +RUN apt-get -y install docker-ce-cli # Install gcloud. RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-289.0.0-linux-x86_64.tar.gz | \ diff --git a/images/syzkaller/Dockerfile b/images/syzkaller/Dockerfile index 9a85ae345..9f739d972 100644 --- a/images/syzkaller/Dockerfile +++ b/images/syzkaller/Dockerfile @@ -2,7 +2,7 @@ FROM gcr.io/syzkaller/env # This image is mostly for investigating syzkaller crashes, so let's install # developer tools. -RUN apt update && apt install -y git vim strace gdb procps +RUN apt update --allow-releaseinfo-change && DEBIAN_FRONTEND=noninteractive apt install -y git vim strace gdb procps WORKDIR /syzkaller/gopath/src/github.com/google/syzkaller @@ -46,6 +46,10 @@ global: - "(field|method|struct|type) .* should be .*" # Generated proto code sometimes duplicates imports with aliases. - "duplicate import" + # These will never be annotated. + - "unexpected call to atomic function" + # Generated proto code creates declarations like 'var start int = iNdEx' + - "should omit type .* from declaration; it will be inferred from the right-hand side" internal: suppress: # We use ALL_CAPS for system definitions, @@ -55,6 +59,10 @@ global: # Same story for underscores. - "should not use ALL_CAPS in Go names" - "should not use underscores in Go names" + # These need to be annotated. + - "unexpected call to atomic function.*" + - "return with unexpected locks held.*" + - "incompatible return states.*" exclude: # Generated: exempt all. - pkg/shim/runtimeoptions/runtimeoptions_cri.go @@ -76,49 +84,7 @@ analyzers: checklocks: internal: exclude: - - "^-$" # b/181776900: analyzer fails on buildkite - - pkg/sentry/fs/dirent.go # unsupported usage. - - pkg/sentry/fs/fsutil/inode_cached.go # unsupported usage. - - pkg/sentry/fs/gofer/inode_state.go # unsupported usage. - - pkg/sentry/fs/gofer/session.go # unsupported usage. - - pkg/sentry/fs/ramfs/dir.go # unsupported usage. - - pkg/sentry/fsimpl/fuse/connection.go # unsupported usage. - - pkg/sentry/fsimpl/kernfs/filesystem.go # unsupported usage. - - pkg/sentry/fsimpl/kernfs/inode_impl_util.go # unsupported usage. - - pkg/sentry/fsimpl/fuse/dev_test.go # unsupported usage. - - pkg/sentry/fsimpl/gofer/filesystem.go # unsupported usage. - - pkg/sentry/fsimpl/gofer/gofer.go # unsupported usage. - - pkg/sentry/fsimpl/gofer/regular_file.go # unsupported usage. - - pkg/sentry/fsimpl/gofer/revalidate.go # unsupported usage. - - pkg/sentry/fsimpl/gofer/special_file.go # unsupported usage. - - pkg/sentry/fsimpl/gofer/symlink.go # unsupported usage. - - pkg/sentry/fsimpl/overlay/copy_up.go # unsupported usage. - - pkg/sentry/fsimpl/overlay/filesystem.go # unsupported usage. - - pkg/sentry/fsimpl/tmpfs/filesystem.go # unsupported usage. - - pkg/sentry/fsimpl/verity/filesystem.go # unsupported usage. - - pkg/sentry/kernel/futex/futex.go # unsupported usage. - - pkg/sentry/kernel/pipe/vfs.go # unsupported usage. - - pkg/sentry/mm/syscalls.go # unsupported usage. - - pkg/sentry/kernel/fd_table.go # unsupported usage. - - pkg/sentry/kernel/ptrace.go # unsupported usage. - - pkg/sentry/time/calibrated_clock_test.go # unsupported usage. - - pkg/sentry/kernel/task_context.go # unsupported usage. - - pkg/sentry/pgalloc/pgalloc.go # unsupported usage. - - pkg/sentry/socket/unix/transport/connectioned.go # unsupported usage. - - pkg/sentry/vfs/dentry.go # unsupported usage. - - pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go # unsupported usage. - - pkg/tcpip/stack/conntrack.go # unsupported usage. - - pkg/tcpip/transport/packet/endpoint_state.go # unsupported usage. - - pkg/tcpip/transport/raw/endpoint_state.go # unsupported usage. - - pkg/tcpip/transport/icmp/endpoint.go # unsupported usage. - - pkg/tcpip/transport/icmp/endpoint_state.go # unsupported usage. - - pkg/tcpip/transport/tcp/accept.go # unsupported usage. - - pkg/tcpip/transport/tcp/connect.go # unsupported usage. - - pkg/tcpip/transport/tcp/dispatcher.go # unsupported usage (TryLock) - - pkg/tcpip/transport/tcp/endpoint.go # unsupported usage. - - pkg/tcpip/transport/tcp/endpoint_state.go # unsupported usage. - - pkg/tcpip/transport/udp/endpoint.go # unsupported usage (defer unlock in anonymous function) - - pkg/tcpip/transport/udp/endpoint_state.go # unsupported usage (missing nested mutex annotation support) + - "^-$" # b/181776900: analyzer fails on buildkite. shadow: # Disable for now. generated: exclude: [".*"] @@ -166,6 +132,8 @@ analyzers: external: # Enabled. unreachable: external: # Enabled. + exclude: + - ".*protobuf/.*.go" unsafeptr: internal: exclude: @@ -177,35 +145,53 @@ analyzers: - pkg/sentry/platform/kvm/bluepill_unsafe.go # Special case. - pkg/sentry/platform/kvm/machine_unsafe.go # Special case. - pkg/sentry/platform/safecopy/safecopy_unsafe.go # Special case. + - pkg/sentry/usage/memory_unsafe.go # Special case. - pkg/sentry/vfs/mount_unsafe.go # Special case. - pkg/state/decode_unsafe.go # Special case. unusedresult: external: # Enabled. checkescape: external: # Enabled. - SA4016: + checklinkname: + external: # Enabled. + suppress: + # We don't care to check every single linkname in the Go standard + # library. Suppress findings about stdlib linkname targets we haven't + # described in checklinkname. + # + # Note that we _do_ want to check the signature of the known linkname + # targets in the standard library, so we still need to run + # checklinkname on stdlib generally. + - "linkname to unknown symbol" + SA1019: # Use of deprecated identifier. + # disable for now due to misattribution from golang.org/issue/44195. + generated: + exclude: [".*"] internal: - exclude: - - pkg/gohacks/gohacks_unsafe.go # x ^ 0 always equals x. - SA2001: + exclude: [".*"] + SA2001: # Empty critical section. internal: exclude: - pkg/sentry/fs/fs.go # Intentional. - pkg/sentry/fs/gofer/inode.go # Intentional. - pkg/refs/refcounter_test.go # Intentional. - ST1019: + SA4016: # Useless bitwise operations. + internal: + exclude: + - pkg/gohacks/gohacks_unsafe.go # x ^ 0 always equals x. + SA5011: # Possible nil pointer dereference. + internal: + exclude: + # https://github.com/dominikh/go-tools/issues/924 + - pkg/sentry/fs/fdpipe/pipe_opener_test.go + - pkg/tcpip/tests/integration/link_resolution_test.go + ST1019: # Multiple imports of the same package. generated: exclude: # package ".../kubeapi/core/v1/v1" is being imported more than once - generated.gen.pb.go - ST1021: + ST1021: # Doc should start with type name. internal: suppress: - "comment on exported type Translation" # Intentional. - "comment on exported type PinnedRange" # Intentional. - SA5011: - internal: - exclude: - # https://github.com/dominikh/go-tools/issues/924 - - pkg/sentry/fs/fdpipe/pipe_opener_test.go - - pkg/tcpip/tests/integration/link_resolution_test.go diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go index 3059479bd..008bbca08 100644 --- a/pkg/abi/abi_linux.go +++ b/pkg/abi/abi_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package abi diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 38288bdb7..3576396c1 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -21,7 +21,6 @@ go_library( "epoll.go", "epoll_amd64.go", "epoll_arm64.go", - "errors.go", "errqueue.go", "eventfd.go", "exec.go", @@ -42,6 +41,7 @@ go_library( "linux.go", "membarrier.go", "mm.go", + "msgqueue.go", "netdevice.go", "netfilter.go", "netfilter_ipv6.go", diff --git a/pkg/abi/linux/arch_amd64.go b/pkg/abi/linux/arch_amd64.go index 0be31e755..064c0a6da 100644 --- a/pkg/abi/linux/arch_amd64.go +++ b/pkg/abi/linux/arch_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/clone.go b/pkg/abi/linux/clone.go index c2cbfca5e..322a4ef5a 100644 --- a/pkg/abi/linux/clone.go +++ b/pkg/abi/linux/clone.go @@ -16,13 +16,16 @@ package linux // Clone constants per clone(2). const ( + CSIGNAL = 0xff + CLONE_VM = 0x100 CLONE_FS = 0x200 CLONE_FILES = 0x400 CLONE_SIGHAND = 0x800 - CLONE_PARENT = 0x8000 + CLONE_PIDFD = 0x1000 CLONE_PTRACE = 0x2000 CLONE_VFORK = 0x4000 + CLONE_PARENT = 0x8000 CLONE_THREAD = 0x10000 CLONE_NEWNS = 0x20000 CLONE_SYSVSEM = 0x40000 @@ -32,10 +35,30 @@ const ( CLONE_DETACHED = 0x400000 CLONE_UNTRACED = 0x800000 CLONE_CHILD_SETTID = 0x1000000 + CLONE_NEWCGROUP = 0x2000000 CLONE_NEWUTS = 0x4000000 CLONE_NEWIPC = 0x8000000 CLONE_NEWUSER = 0x10000000 CLONE_NEWPID = 0x20000000 CLONE_NEWNET = 0x40000000 CLONE_IO = 0x80000000 + + // Only passable via clone3(2). + CLONE_CLEAR_SIGHAND = 0x100000000 + CLONE_INTO_CGROUP = 0x200000000 ) + +// CloneArgs is struct clone_args, from include/uapi/linux/sched.h. +type CloneArgs struct { + Flags uint64 + Pidfd uint64 + ChildTID uint64 + ParentTID uint64 + ExitSignal uint64 + Stack uint64 + StackSize uint64 + TLS uint64 + SetTID uint64 + SetTIDSize uint64 + Cgroup uint64 +} diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go index 7e74b1143..7d5b9fdfb 100644 --- a/pkg/abi/linux/epoll_amd64.go +++ b/pkg/abi/linux/epoll_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go index a35939cc9..5e5960d32 100644 --- a/pkg/abi/linux/epoll_arm64.go +++ b/pkg/abi/linux/epoll_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/syserror/BUILD b/pkg/abi/linux/errno/BUILD index 76bee5a64..d003342d5 100644 --- a/pkg/syserror/BUILD +++ b/pkg/abi/linux/errno/BUILD @@ -3,8 +3,7 @@ load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( - name = "syserror", - srcs = ["syserror.go"], + name = "errno", + srcs = ["errno.go"], visibility = ["//visibility:public"], - deps = ["@org_golang_x_sys//unix:go_default_library"], ) diff --git a/pkg/abi/linux/errors.go b/pkg/abi/linux/errno/errno.go index b08b2687e..38ebbb1d7 100644 --- a/pkg/abi/linux/errors.go +++ b/pkg/abi/linux/errno/errno.go @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -package linux +// Package errno holds errno codes for abi/linux. +package errno // Errno represents a Linux errno value. -type Errno int +type Errno uint32 // Errno values from include/uapi/asm-generic/errno-base.h. const ( @@ -60,8 +61,8 @@ const ( ENOLCK ENOSYS ENOTEMPTY - ELOOP //40 - _ // Skip for EWOULDBLOCK = EAGAIN + ELOOP // 40 + _ // Skip for EWOULDBLOCK = EAGAIN. ENOMSG //42 EIDRM ECHRNG @@ -78,13 +79,13 @@ const ( ENOANO EBADRQC EBADSLT - _ // Skip for EDEADLOCK = EDEADLK + _ // Skip for EDEADLOCK = EDEADLK. EBFONT ENOSTR // 60 ENODATA ETIME ENOSR - ENONET + _ // Skip for ENOENT = ENONET. ENOPKG EREMOTE ENOLINK @@ -156,8 +157,32 @@ const ( EHWPOISON ) -// errnos derived from other errnos +// errnos derived from other errnos. const ( EWOULDBLOCK = EAGAIN EDEADLOCK = EDEADLK + ENONET = ENOENT +) + +// errnos for internal errors. +const ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = 512 + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = 513 + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = 514 + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = 516 ) diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go index 6b72364ea..ab404b17e 100644 --- a/pkg/abi/linux/file_amd64.go +++ b/pkg/abi/linux/file_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go index 6492c9038..6234955ab 100644 --- a/pkg/abi/linux/file_arm64.go +++ b/pkg/abi/linux/file_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 006b5a525..29062c97a 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -170,3 +170,18 @@ const ( KCOV_MODE_TRACE_PC = 2 KCOV_MODE_TRACE_CMP = 3 ) + +// Attestation ioctls. +var ( + SIGN_ATTESTATION_REPORT = IOC(_IOC_READ, 's', 1, 65) +) + +// SizeOfQuoteInputData is the number of bytes in the input data of ioctl call +// to get quote. +const SizeOfQuoteInputData = 64 + +// SignReport is a struct that gets signed quote from input data. +type SignReport struct { + data [64]byte + quote []byte +} diff --git a/pkg/abi/linux/msgqueue.go b/pkg/abi/linux/msgqueue.go new file mode 100644 index 000000000..0612a8214 --- /dev/null +++ b/pkg/abi/linux/msgqueue.go @@ -0,0 +1,108 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/marshal/primitive" +) + +// Linux-specific control commands. Source: include/uapi/linux/msg.h +const ( + MSG_STAT = 11 + MSG_INFO = 12 + MSG_STAT_ANY = 13 +) + +// msgrcv(2) options. Source: include/uapi/linux/msg.h +const ( + MSG_NOERROR = 010000 // No error if message is too big. + MSG_EXCEPT = 020000 // Receive any message except of specified type. + MSG_COPY = 040000 // Copy (not remove) all queue messages. +) + +// System-wide limits for message queues. Source: include/uapi/linux/msg.h +const ( + MSGMNI = 32000 // Maximum number of message queue identifiers. + MSGMAX = 8192 // Maximum size of message (bytes). + MSGMNB = 16384 // Default max size of a message queue. +) + +// System-wide limits. Unused. Source: include/uapi/linux/msg.h +const ( + MSGPOOL = (MSGMNI * MSGMNB / 1024) + MSGTQL = MSGMNB + MSGMAP = MSGMNB + MSGSSZ = 16 + + // MSGSEG is simplified due to the inexistance of a ternary operator. + MSGSEG = 0xffff +) + +// MsqidDS is equivelant to struct msqid64_ds. Source: +// include/uapi/asm-generic/shmbuf.h +// +// +marshal +type MsqidDS struct { + MsgPerm IPCPerm // IPC permissions. + MsgStime TimeT // Last msgsnd time. + MsgRtime TimeT // Last msgrcv time. + MsgCtime TimeT // Last change time. + MsgCbytes uint64 // Current number of bytes on the queue. + MsgQnum uint64 // Number of messages in the queue. + MsgQbytes uint64 // Max number of bytes in the queue. + MsgLspid int32 // PID of last msgsnd. + MsgLrpid int32 // PID of last msgrcv. + unused4 uint64 + unused5 uint64 +} + +// MsgBuf is equivelant to struct msgbuf. Source: include/uapi/linux/msg.h +// +// +marshal dynamic +type MsgBuf struct { + Type primitive.Int64 + Text primitive.ByteSlice +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (b *MsgBuf) SizeBytes() int { + return b.Type.SizeBytes() + b.Text.SizeBytes() +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (b *MsgBuf) MarshalBytes(dst []byte) { + b.Type.MarshalUnsafe(dst) + b.Text.MarshalBytes(dst[b.Type.SizeBytes():]) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (b *MsgBuf) UnmarshalBytes(src []byte) { + b.Type.UnmarshalUnsafe(src) + b.Text.UnmarshalBytes(src[b.Type.SizeBytes():]) +} + +// MsgInfo is equivelant to struct msginfo. Source: include/uapi/linux/msg.h +// +// +marshal +type MsgInfo struct { + MsgPool int32 + MsgMap int32 + MsgMax int32 + MsgMnb int32 + MsgMni int32 + MsgSsz int32 + MsgTql int32 + MsgSeg uint16 `marshal:"unaligned"` +} diff --git a/pkg/abi/linux/netfilter_ipv6.go b/pkg/abi/linux/netfilter_ipv6.go index b088b207c..f8c0e891e 100644 --- a/pkg/abi/linux/netfilter_ipv6.go +++ b/pkg/abi/linux/netfilter_ipv6.go @@ -41,7 +41,6 @@ const ( // IP6T_ORIGINAL_DST is the ip6tables SOL_IPV6 socket option. Corresponds to // the value in include/uapi/linux/netfilter_ipv6/ip6_tables.h. -// TODO(gvisor.dev/issue/3549): Support IPv6 original destination. const IP6T_ORIGINAL_DST = 80 // IP6TReplace is the argument for the IP6T_SO_SET_REPLACE sockopt. It diff --git a/pkg/abi/linux/ptrace_amd64.go b/pkg/abi/linux/ptrace_amd64.go index e722971f1..e970b5b4a 100644 --- a/pkg/abi/linux/ptrace_amd64.go +++ b/pkg/abi/linux/ptrace_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/ptrace_arm64.go b/pkg/abi/linux/ptrace_arm64.go index 3d0906565..91e5af56b 100644 --- a/pkg/abi/linux/ptrace_arm64.go +++ b/pkg/abi/linux/ptrace_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/abi/linux/sem_amd64.go b/pkg/abi/linux/sem_amd64.go index ab980cb4f..cabd2d4b8 100644 --- a/pkg/abi/linux/sem_amd64.go +++ b/pkg/abi/linux/sem_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/abi/linux/sem_arm64.go b/pkg/abi/linux/sem_arm64.go index 521468fb1..a0c467dc4 100644 --- a/pkg/abi/linux/sem_arm64.go +++ b/pkg/abi/linux/sem_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index 31d020f77..06a4c6401 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -293,8 +293,6 @@ type Sigevent struct { UnRemainder [44]byte } -// LINT.IfChange - // SigAction represents struct sigaction. // // +marshal @@ -306,8 +304,6 @@ type SigAction struct { Mask SignalSet } -// LINT.ThenChange(../../safecopy/safecopy_unsafe.go) - // SignalStack represents information about a user stack, and is equivalent to // stack_t. // diff --git a/pkg/abi/linux/wait.go b/pkg/abi/linux/wait.go index 4bdc280d1..710729138 100644 --- a/pkg/abi/linux/wait.go +++ b/pkg/abi/linux/wait.go @@ -14,6 +14,10 @@ package linux +import ( + "fmt" +) + // Options for waitpid(2), wait4(2), and/or waitid(2), from // include/uapi/linux/wait.h. const ( @@ -34,3 +38,124 @@ const ( P_PID = 0x1 P_PGID = 0x2 ) + +// WaitStatus represents a thread status, as returned by the wait* family of +// syscalls. +type WaitStatus uint32 + +// WaitStatusExit returns a WaitStatus representing the given exit status. +func WaitStatusExit(status int32) WaitStatus { + return WaitStatus(uint32(status) << 8) +} + +// WaitStatusTerminationSignal returns a WaitStatus representing termination by +// the given signal. +func WaitStatusTerminationSignal(sig Signal) WaitStatus { + return WaitStatus(uint32(sig)) +} + +// WaitStatusStopped returns a WaitStatus representing stoppage by the given +// signal or ptrace trap code. +func WaitStatusStopped(code uint32) WaitStatus { + return WaitStatus(code<<8 | 0x7f) +} + +// WaitStatusContinued returns a WaitStatus representing continuation by +// SIGCONT. +func WaitStatusContinued() WaitStatus { + return WaitStatus(0xffff) +} + +// WithCoreDump returns a copy of ws that indicates that a core dump was +// generated. +// +// Preconditions: ws.Signaled(). +func (ws WaitStatus) WithCoreDump() WaitStatus { + return ws | 0x80 +} + +// Exited returns true if ws represents an exit status, consistent with +// WIFEXITED. +func (ws WaitStatus) Exited() bool { + return ws&0x7f == 0 +} + +// Signaled returns true if ws represents a termination by signal, consistent +// with WIFSIGNALED. +func (ws WaitStatus) Signaled() bool { + // ws&0x7f != 0 (exited) and ws&0x7f != 0x7f (stopped or continued) + return ((ws&0x7f)+1)>>1 != 0 +} + +// CoreDumped returns true if ws indicates that a core dump was produced, +// consistent with WCOREDUMP. +// +// Preconditions: ws.Signaled(). +func (ws WaitStatus) CoreDumped() bool { + return ws&0x80 != 0 +} + +// Stopped returns true if ws represents a stoppage, consistent with +// WIFSTOPPED. +func (ws WaitStatus) Stopped() bool { + return ws&0xff == 0x7f +} + +// Continued returns true if ws represents a continuation by SIGCONT, +// consistent with WIFCONTINUED. +func (ws WaitStatus) Continued() bool { + return ws == 0xffff +} + +// ExitStatus returns the lower 8 bits of the exit status represented by ws, +// consistent with WEXITSTATUS. +// +// Preconditions: ws.Exited(). +func (ws WaitStatus) ExitStatus() uint32 { + return uint32((ws & 0xff00) >> 8) +} + +// TerminationSignal returns the termination signal represented by ws, +// consistent with WTERMSIG. +// +// Preconditions: ws.Signaled(). +func (ws WaitStatus) TerminationSignal() Signal { + return Signal(ws & 0x7f) +} + +// StopSignal returns the stop signal represented by ws, consistent with +// WSTOPSIG. +// +// Preconditions: ws.Stopped(). +func (ws WaitStatus) StopSignal() Signal { + return Signal((ws & 0xff00) >> 8) +} + +// PtraceEvent returns the PTRACE_EVENT_* field in ws. +// +// Preconditions: ws.Stopped(). +func (ws WaitStatus) PtraceEvent() uint32 { + return uint32(ws >> 16) +} + +// String implements fmt.Stringer.String. +func (ws WaitStatus) String() string { + switch { + case ws.Exited(): + return fmt.Sprintf("exit status %d", ws.ExitStatus()) + case ws.Signaled(): + if ws.CoreDumped() { + return fmt.Sprintf("killed by signal %d (core dumped)", ws.TerminationSignal()) + } + return fmt.Sprintf("killed by signal %d", ws.TerminationSignal()) + case ws.Stopped(): + if ev := ws.PtraceEvent(); ev != 0 { + return fmt.Sprintf("stopped by signal %d (PTRACE_EVENT %d)", ws.StopSignal(), ev) + } + return fmt.Sprintf("stopped by signal %d", ws.StopSignal()) + case ws.Continued(): + return "continued" + default: + return fmt.Sprintf("unknown status %#x", uint32(ws)) + } +} diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index bd3a5cce9..6d8b5f818 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -8,7 +8,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/context", - "//pkg/syserror", + "//pkg/errors/linuxerr", ], ) diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index d7acc1d9f..985199cfa 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -20,7 +20,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // Sleeper must be implemented by users of the abortable mutex to allow for @@ -33,7 +33,7 @@ type NoopSleeper = context.Context // Block blocks until either receiving from ch succeeds (in which case it // returns nil) or sleeper is interrupted (in which case it returns -// syserror.ErrInterrupted). +// linuxerr.ErrInterrupted). func Block(sleeper Sleeper, ch <-chan struct{}) error { cancel := sleeper.SleepStart() select { @@ -42,7 +42,7 @@ func Block(sleeper Sleeper, ch <-chan struct{}) error { return nil case <-cancel: sleeper.SleepFinish(false) - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } } diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index 11072d4de..02c0e52b9 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -18,7 +18,10 @@ go_library( go_test( name = "atomicbitops_test", size = "small", - srcs = ["atomicbitops_test.go"], + srcs = [ + "aligned_test.go", + "atomicbitops_test.go", + ], library = ":atomicbitops", deps = ["//pkg/sync"], ) diff --git a/pkg/atomicbitops/aligned_32bit_unsafe.go b/pkg/atomicbitops/aligned_32bit_unsafe.go index df706b453..0e4765c48 100644 --- a/pkg/atomicbitops/aligned_32bit_unsafe.go +++ b/pkg/atomicbitops/aligned_32bit_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm || mips || 386 // +build arm mips 386 package atomicbitops @@ -33,14 +34,15 @@ import ( // // +stateify savable type AlignedAtomicInt64 struct { - value [15]byte + value int64 + value32 int32 } func (aa *AlignedAtomicInt64) ptr() *int64 { - // In the 15-byte aa.value, there are guaranteed to be 8 contiguous - // bytes with 64-bit alignment. We find an address in this range by - // adding 7, then clear the 3 least significant bits to get its start. - return (*int64)(unsafe.Pointer((uintptr(unsafe.Pointer(&aa.value[0])) + 7) &^ 7)) + // On 32-bit systems, aa.value is guaranteed to be 32-bit aligned. It means + // that in the 12-byte aa.value, there are guaranteed to be 8 contiguous bytes + // with 64-bit alignment. + return (*int64)(unsafe.Pointer((uintptr(unsafe.Pointer(&aa.value)) + 4) &^ 7)) } // Load is analagous to atomic.LoadInt64. @@ -70,14 +72,15 @@ func (aa *AlignedAtomicInt64) Add(v int64) int64 { // // +stateify savable type AlignedAtomicUint64 struct { - value [15]byte + value uint64 + value32 uint32 } func (aa *AlignedAtomicUint64) ptr() *uint64 { - // In the 15-byte aa.value, there are guaranteed to be 8 contiguous - // bytes with 64-bit alignment. We find an address in this range by - // adding 7, then clear the 3 least significant bits to get its start. - return (*uint64)(unsafe.Pointer((uintptr(unsafe.Pointer(&aa.value[0])) + 7) &^ 7)) + // On 32-bit systems, aa.value is guaranteed to be 32-bit aligned. It means + // that in the 12-byte aa.value, there are guaranteed to be 8 contiguous bytes + // with 64-bit alignment. + return (*uint64)(unsafe.Pointer((uintptr(unsafe.Pointer(&aa.value)) + 4) &^ 7)) } // Load is analagous to atomic.LoadUint64. diff --git a/pkg/atomicbitops/aligned_64bit.go b/pkg/atomicbitops/aligned_64bit.go index 1544c7814..2c421d920 100644 --- a/pkg/atomicbitops/aligned_64bit.go +++ b/pkg/atomicbitops/aligned_64bit.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !arm && !mips && !386 // +build !arm,!mips,!386 package atomicbitops diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/atomicbitops/aligned_test.go index 3486864dc..e7123d2b8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go +++ b/pkg/atomicbitops/aligned_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,17 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. -package disklayout +package atomicbitops import ( "testing" ) -// TestDirentSize tests that the dirent structs are of the correct -// size. -func TestDirentSize(t *testing.T) { - var dOld DirentOld - assertSize(t, &dOld, DirentSize) - var dNew DirentNew - assertSize(t, &dNew, DirentSize) +func TestAtomiciInt64(t *testing.T) { + v := struct { + v8 int8 + v64 AlignedAtomicInt64 + }{} + v.v64.Add(1) +} + +func TestAtomicUint64(t *testing.T) { + v := struct { + v8 uint8 + v64 AlignedAtomicUint64 + }{} + v.v64.Add(1) } diff --git a/pkg/atomicbitops/atomicbitops.go b/pkg/atomicbitops/atomicbitops.go index 1be081719..4c4606a58 100644 --- a/pkg/atomicbitops/atomicbitops.go +++ b/pkg/atomicbitops/atomicbitops.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || arm64 // +build amd64 arm64 // Package atomicbitops provides extensions to the sync/atomic package. diff --git a/pkg/atomicbitops/atomicbitops_noasm.go b/pkg/atomicbitops/atomicbitops_noasm.go index 3b2898256..474c0c815 100644 --- a/pkg/atomicbitops/atomicbitops_noasm.go +++ b/pkg/atomicbitops/atomicbitops_noasm.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !amd64 && !arm64 // +build !amd64,!arm64 package atomicbitops diff --git a/pkg/bitmap/BUILD b/pkg/bitmap/BUILD new file mode 100644 index 000000000..0f1e7006d --- /dev/null +++ b/pkg/bitmap/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "bitmap", + srcs = ["bitmap.go"], + visibility = ["//:sandbox"], +) + +go_test( + name = "bitmap_test", + size = "small", + srcs = ["bitmap_test.go"], + library = ":bitmap", +) diff --git a/pkg/bitmap/bitmap.go b/pkg/bitmap/bitmap.go new file mode 100644 index 000000000..12d2fc2b8 --- /dev/null +++ b/pkg/bitmap/bitmap.go @@ -0,0 +1,234 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package bitmap provides the implementation of bitmap. +package bitmap + +import ( + "math" + "math/bits" +) + +// Bitmap implements an efficient bitmap. +// +// +stateify savable +type Bitmap struct { + // numOnes is the number of ones in the bitmap. + numOnes uint32 + + // bitBlock holds the bits. The type of bitBlock is uint64 which means + // each number in bitBlock contains 64 entries. + bitBlock []uint64 +} + +// New create a new empty Bitmap. +func New(size uint32) Bitmap { + b := Bitmap{} + bSize := (size + 63) / 64 + b.bitBlock = make([]uint64, bSize) + return b +} + +// IsEmpty verifies whether the Bitmap is empty. +func (b *Bitmap) IsEmpty() bool { + return b.numOnes == 0 +} + +// Minimum return the smallest value in the Bitmap. +func (b *Bitmap) Minimum() uint32 { + for i := 0; i < len(b.bitBlock); i++ { + if w := b.bitBlock[i]; w != 0 { + r := bits.TrailingZeros64(w) + return uint32(r + i*64) + } + } + return math.MaxInt32 +} + +// FirstZero returns the first unset bit from the range [start, ). +func (b *Bitmap) FirstZero(start uint32) uint32 { + i, nbit := int(start/64), start%64 + n := len(b.bitBlock) + if i >= n { + return math.MaxInt32 + } + w := b.bitBlock[i] | ((1 << nbit) - 1) + for { + if w != ^uint64(0) { + r := bits.TrailingZeros64(^w) + return uint32(r + i*64) + } + i++ + if i == n { + break + } + w = b.bitBlock[i] + } + return math.MaxInt32 +} + +// Maximum return the largest value in the Bitmap. +func (b *Bitmap) Maximum() uint32 { + for i := len(b.bitBlock) - 1; i >= 0; i-- { + if w := b.bitBlock[i]; w != 0 { + r := bits.LeadingZeros64(w) + return uint32(i*64 + 63 - r) + } + } + return uint32(0) +} + +// Add add i to the Bitmap. +func (b *Bitmap) Add(i uint32) { + blockNum, mask := i/64, uint64(1)<<(i%64) + // if blockNum is out of range, extend b.bitBlock + if x, y := int(blockNum), len(b.bitBlock); x >= y { + b.bitBlock = append(b.bitBlock, make([]uint64, x-y+1)...) + } + oldBlock := b.bitBlock[blockNum] + newBlock := oldBlock | mask + if oldBlock != newBlock { + b.bitBlock[blockNum] = newBlock + b.numOnes++ + } +} + +// Remove i from the Bitmap. +func (b *Bitmap) Remove(i uint32) { + blockNum, mask := i/64, uint64(1)<<(i%64) + oldBlock := b.bitBlock[blockNum] + newBlock := oldBlock &^ mask + if oldBlock != newBlock { + b.bitBlock[blockNum] = newBlock + b.numOnes-- + } +} + +// Clone the Bitmap. +func (b *Bitmap) Clone() Bitmap { + bitmap := Bitmap{b.numOnes, make([]uint64, len(b.bitBlock))} + copy(bitmap.bitBlock, b.bitBlock[:]) + return bitmap +} + +// countOnesForBlocks count all 1 bits within b.bitBlock of begin and that of end. +// The begin block and end block are inclusive. +func (b *Bitmap) countOnesForBlocks(begin, end uint32) uint64 { + ones := uint64(0) + beginBlock := begin / 64 + endBlock := end / 64 + for i := beginBlock; i <= endBlock; i++ { + ones += uint64(bits.OnesCount64(b.bitBlock[i])) + } + return ones +} + +// countOnesForAllBlocks count all 1 bits in b.bitBlock. +func (b *Bitmap) countOnesForAllBlocks() uint64 { + ones := uint64(0) + for i := 0; i < len(b.bitBlock); i++ { + ones += uint64(bits.OnesCount64(b.bitBlock[i])) + } + return ones +} + +// flipRange flip the bits within range (begin and end). begin is inclusive and end is exclusive. +func (b *Bitmap) flipRange(begin, end uint32) { + end-- + beginBlock := begin / 64 + endBlock := end / 64 + if beginBlock == endBlock { + b.bitBlock[endBlock] ^= ((^uint64(0) << uint(begin%64)) & ((uint64(1) << (uint(end)%64 + 1)) - 1)) + } else { + b.bitBlock[beginBlock] ^= ^(^uint64(0) << uint(begin%64)) + for i := beginBlock; i < endBlock; i++ { + b.bitBlock[i] = ^b.bitBlock[i] + } + b.bitBlock[endBlock] ^= ((uint64(1) << (uint(end)%64 + 1)) - 1) + } +} + +// clearRange clear the bits within range (begin and end). begin is inclusive and end is exclusive. +func (b *Bitmap) clearRange(begin, end uint32) { + end-- + beginBlock := begin / 64 + endBlock := end / 64 + if beginBlock == endBlock { + b.bitBlock[beginBlock] &= (((uint64(1) << uint(begin%64)) - 1) | ^((uint64(1) << (uint(end)%64 + 1)) - 1)) + } else { + b.bitBlock[beginBlock] &= ((uint64(1) << uint(begin%64)) - 1) + for i := beginBlock + 1; i < endBlock; i++ { + b.bitBlock[i] &= ^b.bitBlock[i] + } + b.bitBlock[endBlock] &= ^((uint64(1) << (uint(end)%64 + 1)) - 1) + } +} + +// ClearRange clear bits within range (begin and end) for the Bitmap. begin is inclusive and end is exclusive. +func (b *Bitmap) ClearRange(begin, end uint32) { + blockRange := end/64 - begin/64 + // When the number of cleared blocks is larger than half of the length of b.bitBlock, + // counting 1s for the entire bitmap has better performance. + if blockRange > uint32(len(b.bitBlock)/2) { + b.clearRange(begin, end) + b.numOnes = uint32(b.countOnesForAllBlocks()) + } else { + oldRangeOnes := b.countOnesForBlocks(begin, end) + b.clearRange(begin, end) + newRangeOnes := b.countOnesForBlocks(begin, end) + b.numOnes += uint32(newRangeOnes - oldRangeOnes) + } +} + +// FlipRange flip bits within range (begin and end) for the Bitmap. begin is inclusive and end is exclusive. +func (b *Bitmap) FlipRange(begin, end uint32) { + blockRange := end/64 - begin/64 + // When the number of flipped blocks is larger than half of the length of b.bitBlock, + // counting 1s for the entire bitmap has better performance. + if blockRange > uint32(len(b.bitBlock)/2) { + b.flipRange(begin, end) + b.numOnes = uint32(b.countOnesForAllBlocks()) + } else { + oldRangeOnes := b.countOnesForBlocks(begin, end) + b.flipRange(begin, end) + newRangeOnes := b.countOnesForBlocks(begin, end) + b.numOnes += uint32(newRangeOnes - oldRangeOnes) + } +} + +// ToSlice transform the Bitmap into slice. For example, a bitmap of [0, 1, 0, 1] +// will return the slice [1, 3]. +func (b *Bitmap) ToSlice() []uint32 { + bitmapSlice := make([]uint32, 0, b.numOnes) + // base is the start number of a bitBlock + base := 0 + for i := 0; i < len(b.bitBlock); i++ { + bitBlock := b.bitBlock[i] + // Iterate through all the numbers held by this bit block. + for bitBlock != 0 { + // Extract the lowest set 1 bit. + j := bitBlock & -bitBlock + // Interpret the bit as the in32 number it represents and add it to result. + bitmapSlice = append(bitmapSlice, uint32((base + int(bits.OnesCount64(j-1))))) + bitBlock ^= j + } + base += 64 + } + return bitmapSlice +} + +// GetNumOnes return the the number of ones in the Bitmap. +func (b *Bitmap) GetNumOnes() uint32 { + return b.numOnes +} diff --git a/pkg/bitmap/bitmap_test.go b/pkg/bitmap/bitmap_test.go new file mode 100644 index 000000000..76ebd779f --- /dev/null +++ b/pkg/bitmap/bitmap_test.go @@ -0,0 +1,306 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bitmap + +import ( + "math" + "reflect" + "testing" +) + +// generateFilledSlice generates a slice fill with numbers. +func generateFilledSlice(min, max, length int) []uint32 { + if max == min { + return []uint32{uint32(min)} + } + if length > (max - min) { + return nil + } + randSlice := make([]uint32, length) + if length != 0 { + rangeNum := uint32((max - min) / length) + randSlice[0], randSlice[length-1] = uint32(min), uint32(max) + for i := 1; i < length-1; i++ { + randSlice[i] = randSlice[i-1] + rangeNum + } + } + return randSlice +} + +// generateFilledBitmap generates a Bitmap filled with fillNum of numbers, +// and returns the slice and bitmap. +func generateFilledBitmap(min, max, fillNum int) ([]uint32, Bitmap) { + bitmap := New(uint32(max)) + randSlice := generateFilledSlice(min, max, fillNum) + for i := 0; i < fillNum; i++ { + bitmap.Add(randSlice[i]) + } + return randSlice, bitmap +} + +func TestNewBitmap(t *testing.T) { + tests := []struct { + name string + size int + expectSize int + }{ + {"length 1", 1, 1}, + {"length 1024", 1024, 16}, + {"length 1025", 1025, 17}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + if bitmap := New(uint32(tt.size)); len(bitmap.bitBlock) != tt.expectSize { + t.Errorf("New created bitmap with %v, bitBlock size: %d, wanted: %d", tt.name, len(bitmap.bitBlock), tt.expectSize) + } + }) + } +} + +func TestAdd(t *testing.T) { + tests := []struct { + name string + bitmapSize int + addNum int + }{ + {"Add with null bitmap.bitBlock", 0, 10}, + {"Add without extending bitBlock", 64, 10}, + {"Add without extending bitblock with margin number", 63, 64}, + {"Add with extended one block", 1024, 1025}, + {"Add with extended more then one block", 1024, 2048}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + bitmap := New(uint32(tt.bitmapSize)) + bitmap.Add(uint32(tt.addNum)) + bitmapSlice := bitmap.ToSlice() + if bitmapSlice[0] != uint32(tt.addNum) { + t.Errorf("%v, get number: %d, wanted: %d.", tt.name, bitmapSlice[0], tt.addNum) + } + }) + } +} + +func TestRemove(t *testing.T) { + bitmap := New(uint32(1024)) + firstSlice := generateFilledSlice(0, 511, 50) + secondSlice := generateFilledSlice(512, 1024, 50) + for i := 0; i < 50; i++ { + bitmap.Add(firstSlice[i]) + bitmap.Add(secondSlice[i]) + } + + for i := 0; i < 50; i++ { + bitmap.Remove(firstSlice[i]) + } + bitmapSlice := bitmap.ToSlice() + if !reflect.DeepEqual(bitmapSlice, secondSlice) { + t.Errorf("After Remove() firstSlice, remained slice: %v, wanted: %v", bitmapSlice, secondSlice) + } + + for i := 0; i < 50; i++ { + bitmap.Remove(secondSlice[i]) + } + bitmapSlice = bitmap.ToSlice() + emptySlice := make([]uint32, 0) + if !reflect.DeepEqual(bitmapSlice, emptySlice) { + t.Errorf("After Remove secondSlice, remained slice: %v, wanted: %v", bitmapSlice, emptySlice) + } + +} + +// Verify flip bits within one bitBlock, one bit and bits cross multi bitBlocks. +func TestFlipRange(t *testing.T) { + tests := []struct { + name string + flipRangeMin int + flipRangeMax int + filledSliceLen int + }{ + {"Flip one number in bitmap", 77, 77, 1}, + {"Flip numbers within one bitBlocks", 5, 60, 20}, + {"Flip numbers that cross multi bitBlocks", 20, 1000, 300}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + fillSlice, bitmap := generateFilledBitmap(tt.flipRangeMin, tt.flipRangeMax, tt.filledSliceLen) + flipFillSlice := make([]uint32, 0) + for i, j := tt.flipRangeMin, 0; i <= tt.flipRangeMax; i++ { + if uint32(i) != fillSlice[j] { + flipFillSlice = append(flipFillSlice, uint32(i)) + } else { + j++ + } + } + + bitmap.FlipRange(uint32(tt.flipRangeMin), uint32(tt.flipRangeMax+1)) + flipBitmapSlice := bitmap.ToSlice() + if !reflect.DeepEqual(flipFillSlice, flipBitmapSlice) { + t.Errorf("%v, flipped slice: %v, wanted: %v", tt.name, flipBitmapSlice, flipFillSlice) + } + }) + } +} + +// Verify clear bits within one bitBlock, one bit and bits cross multi bitBlocks. +func TestClearRange(t *testing.T) { + tests := []struct { + name string + clearRangeMin int + clearRangeMax int + bitmapSize int + }{ + {"ClearRange clear one number", 5, 5, 64}, + {"ClearRange clear numbers within one bitBlock", 4, 61, 64}, + {"ClearRange clear numbers cross multi bitBlocks", 20, 254, 512}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + bitmap := New(uint32(tt.bitmapSize)) + bitmap.FlipRange(uint32(0), uint32(tt.bitmapSize)) + bitmap.ClearRange(uint32(tt.clearRangeMin), uint32(tt.clearRangeMax+1)) + clearedBitmapSlice := bitmap.ToSlice() + clearedSlice := make([]uint32, 0) + for i := 0; i < tt.bitmapSize; i++ { + if i < tt.clearRangeMin || i > tt.clearRangeMax { + clearedSlice = append(clearedSlice, uint32(i)) + } + } + if !reflect.DeepEqual(clearedSlice, clearedBitmapSlice) { + t.Errorf("%v, cleared slice: %v, wanted: %v", tt.name, clearedBitmapSlice, clearedSlice) + } + }) + + } +} + +func TestMinimum(t *testing.T) { + randSlice, bitmap := generateFilledBitmap(0, 1024, 200) + min := bitmap.Minimum() + if min != randSlice[0] { + t.Errorf("Minimum() returns: %v, wanted: %v", min, randSlice[0]) + } + + bitmap.ClearRange(uint32(0), uint32(200)) + min = bitmap.Minimum() + bitmapSlice := bitmap.ToSlice() + if min != bitmapSlice[0] { + t.Errorf("After ClearRange, Minimum() returns: %v, wanted: %v", min, bitmapSlice[0]) + } + + bitmap.FlipRange(uint32(2), uint32(11)) + min = bitmap.Minimum() + bitmapSlice = bitmap.ToSlice() + if min != bitmapSlice[0] { + t.Errorf("After Flip, Minimum() returns: %v, wanted: %v", min, bitmapSlice[0]) + } +} + +func TestMaximum(t *testing.T) { + randSlice, bitmap := generateFilledBitmap(0, 1024, 200) + max := bitmap.Maximum() + if max != randSlice[len(randSlice)-1] { + t.Errorf("Maximum() returns: %v, wanted: %v", max, randSlice[len(randSlice)-1]) + } + + bitmap.ClearRange(uint32(1000), uint32(1025)) + max = bitmap.Maximum() + bitmapSlice := bitmap.ToSlice() + if max != bitmapSlice[len(bitmapSlice)-1] { + t.Errorf("After ClearRange, Maximum() returns: %v, wanted: %v", max, bitmapSlice[len(bitmapSlice)-1]) + } + + bitmap.FlipRange(uint32(1001), uint32(1021)) + max = bitmap.Maximum() + bitmapSlice = bitmap.ToSlice() + if max != bitmapSlice[len(bitmapSlice)-1] { + t.Errorf("After Flip, Maximum() returns: %v, wanted: %v", max, bitmapSlice[len(bitmapSlice)-1]) + } +} + +func TestBitmapNumOnes(t *testing.T) { + randSlice, bitmap := generateFilledBitmap(0, 1024, 200) + bitmapOnes := bitmap.GetNumOnes() + if bitmapOnes != uint32(200) { + t.Errorf("GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 200) + } + // Remove 10 numbers. + for i := 5; i < 15; i++ { + bitmap.Remove(randSlice[i]) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(190) { + t.Errorf("After Remove 10 number, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 190) + } + // Remove the 10 number again, the length supposed not change. + for i := 5; i < 15; i++ { + bitmap.Remove(randSlice[i]) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(190) { + t.Errorf("After Remove the 10 number again, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 190) + } + + // Add 10 number. + for i := 1080; i < 1090; i++ { + bitmap.Add(uint32(i)) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(200) { + t.Errorf("After Add 10 number, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 200) + } + + // Add the 10 number again, the length supposed not change. + for i := 1080; i < 1090; i++ { + bitmap.Add(uint32(i)) + } + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(200) { + t.Errorf("After Add the 10 number again, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 200) + } + + // Flip 10 bits from 0 to 1. + bitmap.FlipRange(uint32(1025), uint32(1035)) + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(210) { + t.Errorf("After Flip, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 210) + } + + // ClearRange numbers range from [0, 1025). + bitmap.ClearRange(uint32(0), uint32(1025)) + bitmapOnes = bitmap.GetNumOnes() + if bitmapOnes != uint32(20) { + t.Errorf("After ClearRange, GetNumOnes() returns: %v, wanted: %v", bitmapOnes, 20) + } +} + +func TestFirstZero(t *testing.T) { + bitmap := New(uint32(1000)) + bitmap.FlipRange(200, 400) + for i, j := range map[uint32]uint32{0: 0, 201: 400, 200: 400, 199: 199, 400: 400, 10000: math.MaxInt32} { + v := bitmap.FirstZero(i) + if v != j { + t.Errorf("Minimum() returns: %v, wanted: %v", v, j) + } + } +} diff --git a/pkg/bits/uint64_arch.go b/pkg/bits/uint64_arch.go index 9f23eff77..fc5634167 100644 --- a/pkg/bits/uint64_arch.go +++ b/pkg/bits/uint64_arch.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || arm64 // +build amd64 arm64 package bits diff --git a/pkg/bits/uint64_arch_amd64_asm.s b/pkg/bits/uint64_arch_amd64_asm.s index 8ff364181..2931b5d56 100644 --- a/pkg/bits/uint64_arch_amd64_asm.s +++ b/pkg/bits/uint64_arch_amd64_asm.s @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 TEXT ·TrailingZeros64(SB),$0-16 diff --git a/pkg/bits/uint64_arch_arm64_asm.s b/pkg/bits/uint64_arch_arm64_asm.s index 814ba562d..eb8d4d280 100644 --- a/pkg/bits/uint64_arch_arm64_asm.s +++ b/pkg/bits/uint64_arch_arm64_asm.s @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 TEXT ·TrailingZeros64(SB),$0-16 diff --git a/pkg/bits/uint64_arch_generic.go b/pkg/bits/uint64_arch_generic.go index 9dd2098d1..83b23a3fc 100644 --- a/pkg/bits/uint64_arch_generic.go +++ b/pkg/bits/uint64_arch_generic.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !amd64 && !arm64 // +build !amd64,!arm64 package bits diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go index 7bcfcd543..a4610f977 100644 --- a/pkg/buffer/view.go +++ b/pkg/buffer/view.go @@ -378,6 +378,20 @@ func (v *View) Copy() (other View) { return } +// Clone makes a more shallow copy compared to Copy. The underlying payload +// slice (buffer.data) is shared but the buffers themselves are copied. +func (v *View) Clone() *View { + other := &View{ + size: v.size, + } + for buf := v.data.Front(); buf != nil; buf = buf.Next() { + newBuf := other.pool.getNoInit() + *newBuf = *buf + other.data.PushBack(newBuf) + } + return other +} + // Apply applies the given function across all valid data. func (v *View) Apply(fn func([]byte)) { for buf := v.data.Front(); buf != nil; buf = buf.Next() { diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go index 629dae8f4..889568177 100644 --- a/pkg/control/server/server.go +++ b/pkg/control/server/server.go @@ -22,6 +22,7 @@ package server import ( "os" + "time" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" @@ -65,13 +66,13 @@ func (s *Server) Wait() { // Stop stops the server. Note that this function should only be called once // and the server should not be used afterwards. -func (s *Server) Stop() { +func (s *Server) Stop(timeout time.Duration) { s.socket.Close() s.Wait() // This will cause existing clients to be terminated safely. If the // registered handlers have a Stop callback, it will be called. - s.server.Stop() + s.server.Stop(timeout) } // StartServing starts listening for connect and spawns the main service diff --git a/pkg/coverage/coverage.go b/pkg/coverage/coverage.go index b33a20802..0fabee92b 100644 --- a/pkg/coverage/coverage.go +++ b/pkg/coverage/coverage.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + // Package coverage provides an interface through which Go coverage data can // be collected, converted to kcov format, and exposed to userspace. // diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index 69eeb7528..4d5e062a8 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -37,6 +37,14 @@ package cpuid // arch/arm64/include/uapi/asm/hwcap.h type Feature int +// HostFeatureSet returns a FeatureSet that matches that of the host machine. +// Callers must not mutate the returned FeatureSet. +func HostFeatureSet() *FeatureSet { + return hostFeatureSet +} + +var hostFeatureSet = getHostFeatureSet() + // ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a // subset of the host feature set. type ErrIncompatible struct { diff --git a/pkg/cpuid/cpuid_arm64.go b/pkg/cpuid/cpuid_arm64.go index 98c6ec62f..04645aed5 100644 --- a/pkg/cpuid/cpuid_arm64.go +++ b/pkg/cpuid/cpuid_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package cpuid @@ -229,6 +230,16 @@ type FeatureSet struct { CPURevision uint8 } +// Clone returns a copy of fs. +func (fs *FeatureSet) Clone() *FeatureSet { + fs2 := *fs + fs2.Set = make(map[Feature]bool) + for f, b := range fs.Set { + fs2.Set[f] = b + } + return &fs2 +} + // CheckHostCompatible returns nil if fs is a subset of the host feature set. // Noop on arm64. func (fs *FeatureSet) CheckHostCompatible() error { @@ -291,9 +302,9 @@ func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) { fmt.Fprintln(b, "") // The /proc/cpuinfo file ends with an extra newline. } -// HostFeatureSet uses hwCap to get host values and construct a feature set +// getHostFeatureSet uses hwCap to get host values and construct a feature set // that matches that of the host machine. -func HostFeatureSet() *FeatureSet { +func getHostFeatureSet() *FeatureSet { s := make(map[Feature]bool) for f := range arm64FeatureStrings { diff --git a/pkg/cpuid/cpuid_arm64_test.go b/pkg/cpuid/cpuid_arm64_test.go index a34f67779..16b1c064a 100644 --- a/pkg/cpuid/cpuid_arm64_test.go +++ b/pkg/cpuid/cpuid_arm64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package cpuid diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go index d60fdb550..36dd20552 100644 --- a/pkg/cpuid/cpuid_parse_x86_test.go +++ b/pkg/cpuid/cpuid_parse_x86_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package cpuid diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go index 392711e8f..a92d32d74 100644 --- a/pkg/cpuid/cpuid_x86.go +++ b/pkg/cpuid/cpuid_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package cpuid @@ -626,6 +627,17 @@ type FeatureSet struct { CacheLine uint32 } +// Clone returns a copy of fs. +func (fs *FeatureSet) Clone() *FeatureSet { + fs2 := *fs + fs2.Set = make(map[Feature]bool) + for f, b := range fs.Set { + fs2.Set[f] = b + } + fs2.Caches = append([]Cache(nil), fs.Caches...) + return &fs2 +} + // FlagsString prints out supported CPU flags. If cpuinfoOnly is true, it is // equivalent to the "flags" field in /proc/cpuinfo. func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string { @@ -960,13 +972,13 @@ func (fs *FeatureSet) UseXsaveopt() bool { // HostID executes a native CPUID instruction. func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32) -// HostFeatureSet uses cpuid to get host values and construct a feature set +// getHostFeatureSet uses cpuid to get host values and construct a feature set // that matches that of the host machine. Note that there are several places // where there appear to be some unnecessary assignments between register names // (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show // where the different feature blocks come from, to make the code easier to // inspect and read. -func HostFeatureSet() *FeatureSet { +func getHostFeatureSet() *FeatureSet { // eax=0 gets max supported feature and vendor ID. _, bx, cx, dx := HostID(0, 0) vendorID := vendorIDFromRegs(bx, cx, dx) diff --git a/pkg/cpuid/cpuid_x86_test.go b/pkg/cpuid/cpuid_x86_test.go index bacf345c8..92a2d9f81 100644 --- a/pkg/cpuid/cpuid_x86_test.go +++ b/pkg/cpuid/cpuid_x86_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package cpuid diff --git a/pkg/crypto/crypto_stdlib.go b/pkg/crypto/crypto_stdlib.go index 514592b08..69e867386 100644 --- a/pkg/crypto/crypto_stdlib.go +++ b/pkg/crypto/crypto_stdlib.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package crypto import ( diff --git a/pkg/errors/BUILD b/pkg/errors/BUILD new file mode 100644 index 000000000..36ea5da0c --- /dev/null +++ b/pkg/errors/BUILD @@ -0,0 +1,10 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "errors", + srcs = ["errors.go"], + visibility = ["//:sandbox"], + deps = ["//pkg/abi/linux/errno"], +) diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go new file mode 100644 index 000000000..7984a770e --- /dev/null +++ b/pkg/errors/errors.go @@ -0,0 +1,40 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package errors holds the standardized error definition for gVisor. +package errors + +import ( + "gvisor.dev/gvisor/pkg/abi/linux/errno" +) + +// Error represents a syscall errno with a descriptive message. +type Error struct { + errno errno.Errno + message string +} + +// New creates a new *Error. +func New(err errno.Errno, message string) *Error { + return &Error{ + errno: err, + message: message, + } +} + +// Error implements error.Error. +func (e *Error) Error() string { return e.message } + +// Errno returns the underlying errno.Errno value. +func (e *Error) Errno() errno.Errno { return e.errno } diff --git a/pkg/linuxerr/BUILD b/pkg/errors/linuxerr/BUILD index c5abbd34f..e73b0e28a 100644 --- a/pkg/linuxerr/BUILD +++ b/pkg/errors/linuxerr/BUILD @@ -4,9 +4,16 @@ package(licenses = ["notice"]) go_library( name = "linuxerr", - srcs = ["linuxerr.go"], + srcs = [ + "internal.go", + "linuxerr.go", + ], visibility = ["//visibility:public"], - deps = ["//pkg/abi/linux"], + deps = [ + "//pkg/abi/linux/errno", + "//pkg/errors", + "@org_golang_x_sys//unix:go_default_library", + ], ) go_test( @@ -14,7 +21,8 @@ go_test( srcs = ["linuxerr_test.go"], deps = [ ":linuxerr", - "//pkg/syserror", + "//pkg/abi/linux/errno", + "//pkg/errors", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/errors/linuxerr/internal.go b/pkg/errors/linuxerr/internal.go new file mode 100644 index 000000000..127bba0df --- /dev/null +++ b/pkg/errors/linuxerr/internal.go @@ -0,0 +1,120 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"),; +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linuxerr + +import ( + "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/errors" +) + +var ( + // ErrWouldBlock is an internal error used to indicate that an operation + // cannot be satisfied immediately, and should be retried at a later + // time, possibly when the caller has received a notification that the + // operation may be able to complete. It is used by implementations of + // the kio.File interface. + ErrWouldBlock = errors.New(errno.EWOULDBLOCK, "request would block") + + // ErrInterrupted is returned if a request is interrupted before it can + // complete. + ErrInterrupted = errors.New(errno.EINTR, "request was interrupted") + + // ErrExceedsFileSizeLimit is returned if a request would exceed the + // file's size limit. + ErrExceedsFileSizeLimit = errors.New(errno.E2BIG, "exceeds file size limit") +) + +var errorMap = map[error]*errors.Error{ + ErrWouldBlock: EWOULDBLOCK, + ErrInterrupted: EINTR, + ErrExceedsFileSizeLimit: EFBIG, +} + +// errorUnwrappers is an array of unwrap functions to extract typed errors. +var errorUnwrappers = []func(error) (*errors.Error, bool){} + +// AddErrorUnwrapper registers an unwrap method that can extract a concrete error +// from a typed, but not initialized, error. +func AddErrorUnwrapper(unwrap func(e error) (*errors.Error, bool)) { + errorUnwrappers = append(errorUnwrappers, unwrap) +} + +// TranslateError translates errors to errnos, it will return false if +// the error was not registered. +func TranslateError(from error) (*errors.Error, bool) { + if err, ok := errorMap[from]; ok { + return err, true + } + // Try to unwrap the error if we couldn't match an error + // exactly. This might mean that a package has its own + // error type. + for _, unwrap := range errorUnwrappers { + if err, ok := unwrap(from); ok { + return err, true + } + } + return nil, false +} + +// These errors are significant because ptrace syscall exit tracing can +// observe them. +// +// For all of the following errors, if the syscall is not interrupted by a +// signal delivered to a user handler, the syscall is restarted. +var ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = errors.New(errno.ERESTARTSYS, "to be restarted if SA_RESTART is set") + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = errors.New(errno.ERESTARTNOINTR, "to be restarted") + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = errors.New(errno.ERESTARTNOHAND, "to be restarted if no handler") + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = errors.New(errno.ERESTART_RESTARTBLOCK, "interrupted by signal") +) + +var restartMap = map[int]*errors.Error{ + -int(errno.ERESTARTSYS): ERESTARTSYS, + -int(errno.ERESTARTNOINTR): ERESTARTNOINTR, + -int(errno.ERESTARTNOHAND): ERESTARTNOHAND, + -int(errno.ERESTART_RESTARTBLOCK): ERESTART_RESTARTBLOCK, +} + +// IsRestartError checks if a given error is a restart error. +func IsRestartError(err error) bool { + switch err { + case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND, ERESTART_RESTARTBLOCK: + return true + default: + return false + } +} + +// SyscallRestartErrorFromReturn returns the SyscallRestartErrno represented by +// rv, the value in a syscall return register. +func SyscallRestartErrorFromReturn(rv uintptr) (*errors.Error, bool) { + err, ok := restartMap[int(rv)] + return err, ok +} diff --git a/pkg/errors/linuxerr/linuxerr.go b/pkg/errors/linuxerr/linuxerr.go new file mode 100644 index 000000000..5905ef593 --- /dev/null +++ b/pkg/errors/linuxerr/linuxerr.go @@ -0,0 +1,355 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"),; +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package linuxerr contains syscall error codes exported as an error interface +// pointers. This allows for fast comparison and return operations comperable +// to unix.Errno constants. +package linuxerr + +import ( + "fmt" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/errors" +) + +const maxErrno uint32 = errno.EHWPOISON + 1 + +// The following errors are semantically identical to Errno of type unix.Errno +// or sycall.Errno. However, since the type are distinct ( these are +// *errors.Error), they are not directly comperable. However, the Errno method +// returns an Errno number such that the error can be compared to unix/syscall.Errno +// (e.g. unix.Errno(EPERM.Errno()) == unix.EPERM is true). Converting unix/syscall.Errno +// to the errors should be done via the lookup methods provided. +var ( + NOERROR = errors.New(errno.NOERRNO, "not an error") + EPERM = errors.New(errno.EPERM, "operation not permitted") + ENOENT = errors.New(errno.ENOENT, "no such file or directory") + ESRCH = errors.New(errno.ESRCH, "no such process") + EINTR = errors.New(errno.EINTR, "interrupted system call") + EIO = errors.New(errno.EIO, "I/O error") + ENXIO = errors.New(errno.ENXIO, "no such device or address") + E2BIG = errors.New(errno.E2BIG, "argument list too long") + ENOEXEC = errors.New(errno.ENOEXEC, "exec format error") + EBADF = errors.New(errno.EBADF, "bad file number") + ECHILD = errors.New(errno.ECHILD, "no child processes") + EAGAIN = errors.New(errno.EAGAIN, "try again") + ENOMEM = errors.New(errno.ENOMEM, "out of memory") + EACCES = errors.New(errno.EACCES, "permission denied") + EFAULT = errors.New(errno.EFAULT, "bad address") + ENOTBLK = errors.New(errno.ENOTBLK, "block device required") + EBUSY = errors.New(errno.EBUSY, "device or resource busy") + EEXIST = errors.New(errno.EEXIST, "file exists") + EXDEV = errors.New(errno.EXDEV, "cross-device link") + ENODEV = errors.New(errno.ENODEV, "no such device") + ENOTDIR = errors.New(errno.ENOTDIR, "not a directory") + EISDIR = errors.New(errno.EISDIR, "is a directory") + EINVAL = errors.New(errno.EINVAL, "invalid argument") + ENFILE = errors.New(errno.ENFILE, "file table overflow") + EMFILE = errors.New(errno.EMFILE, "too many open files") + ENOTTY = errors.New(errno.ENOTTY, "not a typewriter") + ETXTBSY = errors.New(errno.ETXTBSY, "text file busy") + EFBIG = errors.New(errno.EFBIG, "file too large") + ENOSPC = errors.New(errno.ENOSPC, "no space left on device") + ESPIPE = errors.New(errno.ESPIPE, "illegal seek") + EROFS = errors.New(errno.EROFS, "read-only file system") + EMLINK = errors.New(errno.EMLINK, "too many links") + EPIPE = errors.New(errno.EPIPE, "broken pipe") + EDOM = errors.New(errno.EDOM, "math argument out of domain of func") + ERANGE = errors.New(errno.ERANGE, "math result not representable") + + // Errno values from include/uapi/asm-generic/errno.h. + EDEADLK = errors.New(errno.EDEADLK, "resource deadlock would occur") + ENAMETOOLONG = errors.New(errno.ENAMETOOLONG, "file name too long") + ENOLCK = errors.New(errno.ENOLCK, "no record locks available") + ENOSYS = errors.New(errno.ENOSYS, "invalid system call number") + ENOTEMPTY = errors.New(errno.ENOTEMPTY, "directory not empty") + ELOOP = errors.New(errno.ELOOP, "too many symbolic links encountered") + ENOMSG = errors.New(errno.ENOMSG, "no message of desired type") + EIDRM = errors.New(errno.EIDRM, "identifier removed") + ECHRNG = errors.New(errno.ECHRNG, "channel number out of range") + EL2NSYNC = errors.New(errno.EL2NSYNC, "level 2 not synchronized") + EL3HLT = errors.New(errno.EL3HLT, "level 3 halted") + EL3RST = errors.New(errno.EL3RST, "level 3 reset") + ELNRNG = errors.New(errno.ELNRNG, "link number out of range") + EUNATCH = errors.New(errno.EUNATCH, "protocol driver not attached") + ENOCSI = errors.New(errno.ENOCSI, "no CSI structure available") + EL2HLT = errors.New(errno.EL2HLT, "level 2 halted") + EBADE = errors.New(errno.EBADE, "invalid exchange") + EBADR = errors.New(errno.EBADR, "invalid request descriptor") + EXFULL = errors.New(errno.EXFULL, "exchange full") + ENOANO = errors.New(errno.ENOANO, "no anode") + EBADRQC = errors.New(errno.EBADRQC, "invalid request code") + EBADSLT = errors.New(errno.EBADSLT, "invalid slot") + EBFONT = errors.New(errno.EBFONT, "bad font file format") + ENOSTR = errors.New(errno.ENOSTR, "device not a stream") + ENODATA = errors.New(errno.ENODATA, "no data available") + ETIME = errors.New(errno.ETIME, "timer expired") + ENOSR = errors.New(errno.ENOSR, "out of streams resources") + ENOPKG = errors.New(errno.ENOPKG, "package not installed") + EREMOTE = errors.New(errno.EREMOTE, "object is remote") + ENOLINK = errors.New(errno.ENOLINK, "link has been severed") + EADV = errors.New(errno.EADV, "advertise error") + ESRMNT = errors.New(errno.ESRMNT, "srmount error") + ECOMM = errors.New(errno.ECOMM, "communication error on send") + EPROTO = errors.New(errno.EPROTO, "protocol error") + EMULTIHOP = errors.New(errno.EMULTIHOP, "multihop attempted") + EDOTDOT = errors.New(errno.EDOTDOT, "RFS specific error") + EBADMSG = errors.New(errno.EBADMSG, "not a data message") + EOVERFLOW = errors.New(errno.EOVERFLOW, "value too large for defined data type") + ENOTUNIQ = errors.New(errno.ENOTUNIQ, "name not unique on network") + EBADFD = errors.New(errno.EBADFD, "file descriptor in bad state") + EREMCHG = errors.New(errno.EREMCHG, "remote address changed") + ELIBACC = errors.New(errno.ELIBACC, "can not access a needed shared library") + ELIBBAD = errors.New(errno.ELIBBAD, "accessing a corrupted shared library") + ELIBSCN = errors.New(errno.ELIBSCN, ".lib section in a.out corrupted") + ELIBMAX = errors.New(errno.ELIBMAX, "attempting to link in too many shared libraries") + ELIBEXEC = errors.New(errno.ELIBEXEC, "cannot exec a shared library directly") + EILSEQ = errors.New(errno.EILSEQ, "illegal byte sequence") + ERESTART = errors.New(errno.ERESTART, "interrupted system call should be restarted") + ESTRPIPE = errors.New(errno.ESTRPIPE, "streams pipe error") + EUSERS = errors.New(errno.EUSERS, "too many users") + ENOTSOCK = errors.New(errno.ENOTSOCK, "socket operation on non-socket") + EDESTADDRREQ = errors.New(errno.EDESTADDRREQ, "destination address required") + EMSGSIZE = errors.New(errno.EMSGSIZE, "message too long") + EPROTOTYPE = errors.New(errno.EPROTOTYPE, "protocol wrong type for socket") + ENOPROTOOPT = errors.New(errno.ENOPROTOOPT, "protocol not available") + EPROTONOSUPPORT = errors.New(errno.EPROTONOSUPPORT, "protocol not supported") + ESOCKTNOSUPPORT = errors.New(errno.ESOCKTNOSUPPORT, "socket type not supported") + EOPNOTSUPP = errors.New(errno.EOPNOTSUPP, "operation not supported on transport endpoint") + EPFNOSUPPORT = errors.New(errno.EPFNOSUPPORT, "protocol family not supported") + EAFNOSUPPORT = errors.New(errno.EAFNOSUPPORT, "address family not supported by protocol") + EADDRINUSE = errors.New(errno.EADDRINUSE, "address already in use") + EADDRNOTAVAIL = errors.New(errno.EADDRNOTAVAIL, "cannot assign requested address") + ENETDOWN = errors.New(errno.ENETDOWN, "network is down") + ENETUNREACH = errors.New(errno.ENETUNREACH, "network is unreachable") + ENETRESET = errors.New(errno.ENETRESET, "network dropped connection because of reset") + ECONNABORTED = errors.New(errno.ECONNABORTED, "software caused connection abort") + ECONNRESET = errors.New(errno.ECONNRESET, "connection reset by peer") + ENOBUFS = errors.New(errno.ENOBUFS, "no buffer space available") + EISCONN = errors.New(errno.EISCONN, "transport endpoint is already connected") + ENOTCONN = errors.New(errno.ENOTCONN, "transport endpoint is not connected") + ESHUTDOWN = errors.New(errno.ESHUTDOWN, "cannot send after transport endpoint shutdown") + ETOOMANYREFS = errors.New(errno.ETOOMANYREFS, "too many references: cannot splice") + ETIMEDOUT = errors.New(errno.ETIMEDOUT, "connection timed out") + ECONNREFUSED = errors.New(errno.ECONNREFUSED, "connection refused") + EHOSTDOWN = errors.New(errno.EHOSTDOWN, "host is down") + EHOSTUNREACH = errors.New(errno.EHOSTUNREACH, "no route to host") + EALREADY = errors.New(errno.EALREADY, "operation already in progress") + EINPROGRESS = errors.New(errno.EINPROGRESS, "operation now in progress") + ESTALE = errors.New(errno.ESTALE, "stale file handle") + EUCLEAN = errors.New(errno.EUCLEAN, "structure needs cleaning") + ENOTNAM = errors.New(errno.ENOTNAM, "not a XENIX named type file") + ENAVAIL = errors.New(errno.ENAVAIL, "no XENIX semaphores available") + EISNAM = errors.New(errno.EISNAM, "is a named type file") + EREMOTEIO = errors.New(errno.EREMOTEIO, "remote I/O error") + EDQUOT = errors.New(errno.EDQUOT, "quota exceeded") + ENOMEDIUM = errors.New(errno.ENOMEDIUM, "no medium found") + EMEDIUMTYPE = errors.New(errno.EMEDIUMTYPE, "wrong medium type") + ECANCELED = errors.New(errno.ECANCELED, "operation Canceled") + ENOKEY = errors.New(errno.ENOKEY, "required key not available") + EKEYEXPIRED = errors.New(errno.EKEYEXPIRED, "key has expired") + EKEYREVOKED = errors.New(errno.EKEYREVOKED, "key has been revoked") + EKEYREJECTED = errors.New(errno.EKEYREJECTED, "key was rejected by service") + EOWNERDEAD = errors.New(errno.EOWNERDEAD, "owner died") + ENOTRECOVERABLE = errors.New(errno.ENOTRECOVERABLE, "state not recoverable") + ERFKILL = errors.New(errno.ERFKILL, "operation not possible due to RF-kill") + EHWPOISON = errors.New(errno.EHWPOISON, "memory page has hardware error") + + // Errors equivalent to other errors. + EWOULDBLOCK = EAGAIN + EDEADLOCK = EDEADLK + ENONET = ENOENT + ENOATTR = ENODATA + ENOTSUP = EOPNOTSUPP +) + +// A nil *errors.Error denotes no error and is placed at the 0 index of +// errorSlice. Thus, any other empty index should not be nil or a valid error. +// This marks that index as an invalid error so any comparison to nil or a +// valid linuxerr fails. +var errNotValidError = errors.New(errno.Errno(maxErrno), "not a valid error") + +// The following errorSlice holds errors by errno for fast translation between +// errnos (especially uint32(sycall.Errno)) and *errors.Error. +var errorSlice = []*errors.Error{ + // Errno values from include/uapi/asm-generic/errno-base.h. + errno.NOERRNO: NOERROR, + errno.EPERM: EPERM, + errno.ENOENT: ENOENT, + errno.ESRCH: ESRCH, + errno.EINTR: EINTR, + errno.EIO: EIO, + errno.ENXIO: ENXIO, + errno.E2BIG: E2BIG, + errno.ENOEXEC: ENOEXEC, + errno.EBADF: EBADF, + errno.ECHILD: ECHILD, + errno.EAGAIN: EAGAIN, + errno.ENOMEM: ENOMEM, + errno.EACCES: EACCES, + errno.EFAULT: EFAULT, + errno.ENOTBLK: ENOTBLK, + errno.EBUSY: EBUSY, + errno.EEXIST: EEXIST, + errno.EXDEV: EXDEV, + errno.ENODEV: ENODEV, + errno.ENOTDIR: ENOTDIR, + errno.EISDIR: EISDIR, + errno.EINVAL: EINVAL, + errno.ENFILE: ENFILE, + errno.EMFILE: EMFILE, + errno.ENOTTY: ENOTTY, + errno.ETXTBSY: ETXTBSY, + errno.EFBIG: EFBIG, + errno.ENOSPC: ENOSPC, + errno.ESPIPE: ESPIPE, + errno.EROFS: EROFS, + errno.EMLINK: EMLINK, + errno.EPIPE: EPIPE, + errno.EDOM: EDOM, + errno.ERANGE: ERANGE, + + // Errno values from include/uapi/asm-generic/errno.h. + errno.EDEADLK: EDEADLK, + errno.ENAMETOOLONG: ENAMETOOLONG, + errno.ENOLCK: ENOLCK, + errno.ENOSYS: ENOSYS, + errno.ENOTEMPTY: ENOTEMPTY, + errno.ELOOP: ELOOP, + errno.ELOOP + 1: errNotValidError, // No valid errno between ELOOP and ENOMSG. + errno.ENOMSG: ENOMSG, + errno.EIDRM: EIDRM, + errno.ECHRNG: ECHRNG, + errno.EL2NSYNC: EL2NSYNC, + errno.EL3HLT: EL3HLT, + errno.EL3RST: EL3RST, + errno.ELNRNG: ELNRNG, + errno.EUNATCH: EUNATCH, + errno.ENOCSI: ENOCSI, + errno.EL2HLT: EL2HLT, + errno.EBADE: EBADE, + errno.EBADR: EBADR, + errno.EXFULL: EXFULL, + errno.ENOANO: ENOANO, + errno.EBADRQC: EBADRQC, + errno.EBADSLT: EBADSLT, + errno.EBADSLT + 1: errNotValidError, // No valid errno between EBADSLT and ENOPKG. + errno.EBFONT: EBFONT, + errno.ENOSTR: ENOSTR, + errno.ENODATA: ENODATA, + errno.ETIME: ETIME, + errno.ENOSR: ENOSR, + errno.ENOSR + 1: errNotValidError, // No valid errno betweeen ENOSR and ENOPKG. + errno.ENOPKG: ENOPKG, + errno.EREMOTE: EREMOTE, + errno.ENOLINK: ENOLINK, + errno.EADV: EADV, + errno.ESRMNT: ESRMNT, + errno.ECOMM: ECOMM, + errno.EPROTO: EPROTO, + errno.EMULTIHOP: EMULTIHOP, + errno.EDOTDOT: EDOTDOT, + errno.EBADMSG: EBADMSG, + errno.EOVERFLOW: EOVERFLOW, + errno.ENOTUNIQ: ENOTUNIQ, + errno.EBADFD: EBADFD, + errno.EREMCHG: EREMCHG, + errno.ELIBACC: ELIBACC, + errno.ELIBBAD: ELIBBAD, + errno.ELIBSCN: ELIBSCN, + errno.ELIBMAX: ELIBMAX, + errno.ELIBEXEC: ELIBEXEC, + errno.EILSEQ: EILSEQ, + errno.ERESTART: ERESTART, + errno.ESTRPIPE: ESTRPIPE, + errno.EUSERS: EUSERS, + errno.ENOTSOCK: ENOTSOCK, + errno.EDESTADDRREQ: EDESTADDRREQ, + errno.EMSGSIZE: EMSGSIZE, + errno.EPROTOTYPE: EPROTOTYPE, + errno.ENOPROTOOPT: ENOPROTOOPT, + errno.EPROTONOSUPPORT: EPROTONOSUPPORT, + errno.ESOCKTNOSUPPORT: ESOCKTNOSUPPORT, + errno.EOPNOTSUPP: EOPNOTSUPP, + errno.EPFNOSUPPORT: EPFNOSUPPORT, + errno.EAFNOSUPPORT: EAFNOSUPPORT, + errno.EADDRINUSE: EADDRINUSE, + errno.EADDRNOTAVAIL: EADDRNOTAVAIL, + errno.ENETDOWN: ENETDOWN, + errno.ENETUNREACH: ENETUNREACH, + errno.ENETRESET: ENETRESET, + errno.ECONNABORTED: ECONNABORTED, + errno.ECONNRESET: ECONNRESET, + errno.ENOBUFS: ENOBUFS, + errno.EISCONN: EISCONN, + errno.ENOTCONN: ENOTCONN, + errno.ESHUTDOWN: ESHUTDOWN, + errno.ETOOMANYREFS: ETOOMANYREFS, + errno.ETIMEDOUT: ETIMEDOUT, + errno.ECONNREFUSED: ECONNREFUSED, + errno.EHOSTDOWN: EHOSTDOWN, + errno.EHOSTUNREACH: EHOSTUNREACH, + errno.EALREADY: EALREADY, + errno.EINPROGRESS: EINPROGRESS, + errno.ESTALE: ESTALE, + errno.EUCLEAN: EUCLEAN, + errno.ENOTNAM: ENOTNAM, + errno.ENAVAIL: ENAVAIL, + errno.EISNAM: EISNAM, + errno.EREMOTEIO: EREMOTEIO, + errno.EDQUOT: EDQUOT, + errno.ENOMEDIUM: ENOMEDIUM, + errno.EMEDIUMTYPE: EMEDIUMTYPE, + errno.ECANCELED: ECANCELED, + errno.ENOKEY: ENOKEY, + errno.EKEYEXPIRED: EKEYEXPIRED, + errno.EKEYREVOKED: EKEYREVOKED, + errno.EKEYREJECTED: EKEYREJECTED, + errno.EOWNERDEAD: EOWNERDEAD, + errno.ENOTRECOVERABLE: ENOTRECOVERABLE, + errno.ERFKILL: ERFKILL, + errno.EHWPOISON: EHWPOISON, +} + +// ErrorFromErrno gets an error from the list and panics if an invalid entry is requested. +func ErrorFromErrno(e errno.Errno) *errors.Error { + err := errorSlice[e] + // Done this way because a single comparison in benchmarks is 2-3 faster + // than something like ( if err == nil && err > 0 ). + if err != errNotValidError { + return err + } + panic(fmt.Sprintf("invalid error requested with errno: %d", e)) +} + +// Equals compars a linuxerr to a given error +// TODO(b/34162363): Remove when syserror is removed. +func Equals(e *errors.Error, err error) bool { + if err == nil { + return e == NOERROR || e == nil + } + if e == nil { + return err == NOERROR || err == unix.Errno(0) + } + + switch err.(type) { + case *errors.Error: + return e == err + case unix.Errno, error: + return unix.Errno(e.Errno()) == err + } + return false +} diff --git a/pkg/errors/linuxerr/linuxerr_test.go b/pkg/errors/linuxerr/linuxerr_test.go new file mode 100644 index 000000000..df7cd1c5a --- /dev/null +++ b/pkg/errors/linuxerr/linuxerr_test.go @@ -0,0 +1,268 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linuxerr_test + +import ( + "errors" + "fmt" + "io" + "io/fs" + "syscall" + "testing" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux/errno" + gErrors "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" +) + +var globalError error + +func BenchmarkAssignUnix(b *testing.B) { + for i := b.N; i > 0; i-- { + globalError = unix.EINVAL + } +} + +func BenchmarkAssignLinuxerr(b *testing.B) { + for i := b.N; i > 0; i-- { + globalError = linuxerr.EINVAL + } +} + +func BenchmarkCompareUnix(b *testing.B) { + globalError = unix.EAGAIN + j := 0 + for i := b.N; i > 0; i-- { + if globalError == unix.EINVAL { + j++ + } + } +} + +func BenchmarkCompareLinuxerr(b *testing.B) { + globalError = linuxerr.E2BIG + j := 0 + for i := b.N; i > 0; i-- { + if globalError == linuxerr.EINVAL { + j++ + } + } +} + +func BenchmarkSwitchUnix(b *testing.B) { + globalError = unix.EPERM + j := 0 + for i := b.N; i > 0; i-- { + switch globalError { + case unix.EINVAL: + j++ + case unix.EINTR: + j += 2 + case unix.EAGAIN: + j += 3 + } + } +} + +func BenchmarkSwitchLinuxerr(b *testing.B) { + globalError = linuxerr.EPERM + j := 0 + for i := b.N; i > 0; i-- { + switch globalError { + case linuxerr.EINVAL: + j++ + case linuxerr.EINTR: + j += 2 + case linuxerr.EAGAIN: + j += 3 + } + } +} + +func BenchmarkReturnUnix(b *testing.B) { + var localError error + f := func() error { + return unix.EINVAL + } + for i := b.N; i > 0; i-- { + localError = f() + } + if localError != nil { + return + } +} + +func BenchmarkReturnLinuxerr(b *testing.B) { + var localError error + f := func() error { + return linuxerr.EINVAL + } + for i := b.N; i > 0; i-- { + localError = f() + } + if localError != nil { + return + } +} + +func BenchmarkConvertUnixLinuxerr(b *testing.B) { + var localError error + for i := b.N; i > 0; i-- { + localError = linuxerr.ErrorFromErrno(errno.Errno(unix.EINVAL)) + } + if localError != nil { + return + } +} + +func BenchmarkConvertUnixLinuxerrZero(b *testing.B) { + var localError error + for i := b.N; i > 0; i-- { + localError = linuxerr.ErrorFromErrno(errno.Errno(0)) + } + if localError != nil { + return + } +} + +type translationTestTable struct { + errIn error + expectedBool bool + expectedTranslation *gErrors.Error +} + +func TestErrorTranslation(t *testing.T) { + testTable := []translationTestTable{ + { + errIn: linuxerr.ENOENT, + }, + { + errIn: unix.ENOENT, + }, + { + errIn: linuxerr.ErrInterrupted, + expectedBool: true, + expectedTranslation: linuxerr.EINTR, + }, + { + errIn: linuxerr.ERESTART_RESTARTBLOCK, + }, + { + errIn: errors.New("some new error"), + }, + } + for _, tt := range testTable { + t.Run(fmt.Sprintf("err: %v %T", tt.errIn, tt.errIn), func(t *testing.T) { + err, ok := linuxerr.TranslateError(tt.errIn) + if (!tt.expectedBool && err != nil) || (tt.expectedBool != ok) { + t.Fatalf("%v => %v %v expected %v err: nil", tt.errIn, err, ok, tt.expectedBool) + } else if err != tt.expectedTranslation { + t.Fatalf("%v => %v expected %v", tt.errIn, err, tt.expectedTranslation) + } + }) + } +} + +func TestSyscallErrnoToErrors(t *testing.T) { + for _, tc := range []struct { + errno syscall.Errno + err *gErrors.Error + }{ + {errno: syscall.EACCES, err: linuxerr.EACCES}, + {errno: syscall.EAGAIN, err: linuxerr.EAGAIN}, + {errno: syscall.EBADF, err: linuxerr.EBADF}, + {errno: syscall.EBUSY, err: linuxerr.EBUSY}, + {errno: syscall.EDOM, err: linuxerr.EDOM}, + {errno: syscall.EEXIST, err: linuxerr.EEXIST}, + {errno: syscall.EFAULT, err: linuxerr.EFAULT}, + {errno: syscall.EFBIG, err: linuxerr.EFBIG}, + {errno: syscall.EINTR, err: linuxerr.EINTR}, + {errno: syscall.EINVAL, err: linuxerr.EINVAL}, + {errno: syscall.EIO, err: linuxerr.EIO}, + {errno: syscall.ENOTDIR, err: linuxerr.ENOTDIR}, + {errno: syscall.ENOTTY, err: linuxerr.ENOTTY}, + {errno: syscall.EPERM, err: linuxerr.EPERM}, + {errno: syscall.EPIPE, err: linuxerr.EPIPE}, + {errno: syscall.ESPIPE, err: linuxerr.ESPIPE}, + {errno: syscall.EWOULDBLOCK, err: linuxerr.EAGAIN}, + } { + t.Run(tc.errno.Error(), func(t *testing.T) { + e := linuxerr.ErrorFromErrno(errno.Errno(tc.errno)) + if e != tc.err { + t.Fatalf("Mismatch errors: want: %+v (%d) got: %+v %d", tc.err, tc.err.Errno(), e, e.Errno()) + } + }) + } +} + +// TestEqualsMethod tests that the Equals method correctly compares syerror, +// unix.Errno and linuxerr. +// TODO (b/34162363): Remove this. +func TestEqualsMethod(t *testing.T) { + for _, tc := range []struct { + name string + linuxErr []*gErrors.Error + err []error + equal bool + }{ + { + name: "compare nil", + linuxErr: []*gErrors.Error{nil, linuxerr.NOERROR}, + err: []error{nil, linuxerr.NOERROR, unix.Errno(0)}, + equal: true, + }, + { + name: "linuxerr nil error not", + linuxErr: []*gErrors.Error{nil, linuxerr.NOERROR}, + err: []error{unix.Errno(1), linuxerr.EPERM, linuxerr.EACCES}, + equal: false, + }, + { + name: "linuxerr not nil error nil", + linuxErr: []*gErrors.Error{linuxerr.ENOENT}, + err: []error{nil, unix.Errno(0), linuxerr.NOERROR}, + equal: false, + }, + { + name: "equal errors", + linuxErr: []*gErrors.Error{linuxerr.ESRCH}, + err: []error{linuxerr.ESRCH, linuxerr.ESRCH, unix.Errno(linuxerr.ESRCH.Errno())}, + equal: true, + }, + { + name: "unequal errors", + linuxErr: []*gErrors.Error{linuxerr.ENOENT}, + err: []error{linuxerr.ESRCH, linuxerr.ESRCH, unix.Errno(linuxerr.ESRCH.Errno())}, + equal: false, + }, + { + name: "other error", + linuxErr: []*gErrors.Error{nil, linuxerr.NOERROR, linuxerr.E2BIG, linuxerr.EINVAL}, + err: []error{fs.ErrInvalid, io.EOF}, + equal: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + for _, le := range tc.linuxErr { + for _, e := range tc.err { + if linuxerr.Equals(le, e) != tc.equal { + t.Fatalf("Expected %t from Equals method for linuxerr: %s %T and error: %s %T", tc.equal, le, le, e, e) + } + } + } + }) + } +} diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index a264ae2f0..56399232a 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -7,18 +7,19 @@ go_library( srcs = [ "event.go", "event_any.go", + "processor.go", "rate.go", ], visibility = ["//:sandbox"], deps = [ ":eventchannel_go_proto", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/sync", "//pkg/unet", "@org_golang_google_protobuf//encoding/prototext:go_default_library", "@org_golang_google_protobuf//proto:go_default_library", "@org_golang_google_protobuf//types/known/anypb:go_default_library", - "@org_golang_x_sys//unix:go_default_library", "@org_golang_x_time//rate:go_default_library", ], ) diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index 98dfeb1f5..2be2d9d37 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -23,9 +23,9 @@ import ( "encoding/binary" "fmt" - "golang.org/x/sys/unix" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/proto" + "gvisor.dev/gvisor/pkg/errors/linuxerr" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" @@ -155,7 +155,7 @@ func (s *socketEmitter) Emit(msg proto.Message) (bool, error) { for done := 0; done < len(p); { n, err := s.socket.Write(p[done:]) if err != nil { - return (err == unix.EPIPE), err + return linuxerr.Equals(linuxerr.EPIPE, err), err } done += n } diff --git a/pkg/eventchannel/event_any.go b/pkg/eventchannel/event_any.go index a5549f6cd..b708937a4 100644 --- a/pkg/eventchannel/event_any.go +++ b/pkg/eventchannel/event_any.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package eventchannel import ( @@ -23,3 +26,8 @@ import ( func newAny(m proto.Message) (*anypb.Any, error) { return anypb.New(m) } + +func emptyAny() *anypb.Any { + var any anypb.Any + return &any +} diff --git a/pkg/eventchannel/processor.go b/pkg/eventchannel/processor.go new file mode 100644 index 000000000..e765c10d1 --- /dev/null +++ b/pkg/eventchannel/processor.go @@ -0,0 +1,130 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventchannel + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "time" + + "google.golang.org/protobuf/proto" + pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" +) + +// eventProcessor carries display state across multiple events. +type eventProcessor struct { + filtering bool + // filtered is the number of events omitted since printing the last matching + // event. Only meaningful when filtering == true. + filtered uint64 + // allowlist is the set of event names to display. If empty, all events are + // displayed. + allowlist map[string]bool +} + +// newEventProcessor creates a new EventProcessor with filters. +func newEventProcessor(filters []string) *eventProcessor { + e := &eventProcessor{ + filtering: len(filters) > 0, + allowlist: make(map[string]bool), + } + for _, f := range filters { + e.allowlist[f] = true + } + return e +} + +// processOne reads, parses and displays a single event from the event channel. +// +// The event channel is a stream of (msglen, payload) packets; this function +// processes a single such packet. The msglen is a uvarint-encoded length for +// the associated payload. The payload is a binary-encoded 'Any' protobuf, which +// in turn encodes an arbitrary event protobuf. +func (e *eventProcessor) processOne(src io.Reader, out *os.File) error { + // Read and parse the msglen. + lenbuf := make([]byte, binary.MaxVarintLen64) + if _, err := io.ReadFull(src, lenbuf); err != nil { + return err + } + msglen, consumed := binary.Uvarint(lenbuf) + if consumed <= 0 { + return fmt.Errorf("couldn't parse the message length") + } + + // Read the payload. + buf := make([]byte, msglen) + // Copy any unused bytes from the len buffer into the payload buffer. These + // bytes are actually part of the payload. + extraBytes := copy(buf, lenbuf[consumed:]) + if _, err := io.ReadFull(src, buf[extraBytes:]); err != nil { + return err + } + + // Unmarshal the payload into an "Any" protobuf, which encodes the actual + // event. + encodedEv := emptyAny() + if err := proto.Unmarshal(buf, encodedEv); err != nil { + return fmt.Errorf("failed to unmarshal 'any' protobuf message: %v", err) + } + + var ev pb.DebugEvent + if err := (encodedEv).UnmarshalTo(&ev); err != nil { + return fmt.Errorf("failed to decode 'any' protobuf message: %v", err) + } + + if e.filtering && e.allowlist[ev.Name] { + e.filtered++ + return nil + } + + if e.filtering && e.filtered > 0 { + if e.filtered == 1 { + fmt.Fprintf(out, "... filtered %d event ...\n\n", e.filtered) + } else { + fmt.Fprintf(out, "... filtered %d events ...\n\n", e.filtered) + } + e.filtered = 0 + } + + // Extract the inner event and display it. Example: + // + // 2017-10-04 14:35:05.316180374 -0700 PDT m=+1.132485846 + // cloud_gvisor.MemoryUsage { + // total: 23822336 + // } + fmt.Fprintf(out, "%v\n%v {\n", time.Now(), ev.Name) + fmt.Fprintf(out, "%v", ev.Text) + fmt.Fprintf(out, "}\n\n") + + return nil +} + +// ProcessAll reads, parses and displays all events from src. The events are +// displayed to out. +func ProcessAll(src io.Reader, filters []string, out *os.File) error { + ep := newEventProcessor(filters) + for { + switch err := ep.processOne(src, out); err { + case nil: + continue + case io.EOF: + return nil + default: + return err + } + } +} diff --git a/pkg/fdchannel/fdchannel_unsafe.go b/pkg/fdchannel/fdchannel_unsafe.go index 1f24a448d..f9a201eeb 100644 --- a/pkg/fdchannel/fdchannel_unsafe.go +++ b/pkg/fdchannel/fdchannel_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris // +build aix darwin dragonfly freebsd linux netbsd openbsd solaris // Package fdchannel implements passing file descriptors between processes over diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go index 1290d5d10..152557143 100644 --- a/pkg/fdnotifier/fdnotifier.go +++ b/pkg/fdnotifier/fdnotifier.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package fdnotifier contains an adapter that translates IO events (e.g., a diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go index 493ea8375..db917303f 100644 --- a/pkg/fdnotifier/poll_unsafe.go +++ b/pkg/fdnotifier/poll_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package fdnotifier diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD index 9730b88c1..c810c7946 100644 --- a/pkg/flipcall/BUILD +++ b/pkg/flipcall/BUILD @@ -10,9 +10,7 @@ go_library( "flipcall_unsafe.go", "futex_linux.go", "io.go", - "packet_window_allocator.go", - "packet_window_mmap_amd64.go", - "packet_window_mmap_arm64.go", + "packet_window.go", ], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/flipcall/ctrl_futex.go b/pkg/flipcall/ctrl_futex.go index 2e8452a02..99410628f 100644 --- a/pkg/flipcall/ctrl_futex.go +++ b/pkg/flipcall/ctrl_futex.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package flipcall import ( @@ -118,7 +121,7 @@ func (ep *Endpoint) ctrlWaitFirst() error { return ep.futexWaitUntilActive() } -func (ep *Endpoint) ctrlRoundTrip() error { +func (ep *Endpoint) ctrlRoundTrip(mayRetainP bool) error { if err := ep.enterFutexWait(); err != nil { return err } @@ -130,6 +133,9 @@ func (ep *Endpoint) ctrlRoundTrip() error { if err := ep.futexWakePeer(); err != nil { return err } + // Since we don't know if the peer Endpoint is in the same process as this + // one (in which case it may need our P to run), we allow our P to be + // retaken regardless of mayRetainP. return ep.futexWaitUntilActive() } diff --git a/pkg/flipcall/flipcall.go b/pkg/flipcall/flipcall.go index 8d8309a73..88588ba0e 100644 --- a/pkg/flipcall/flipcall.go +++ b/pkg/flipcall/flipcall.go @@ -22,6 +22,7 @@ import ( "sync/atomic" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/memutil" ) // An Endpoint provides the ability to synchronously transfer data and control @@ -96,9 +97,9 @@ func (ep *Endpoint) Init(side EndpointSide, pwd PacketWindowDescriptor, opts ... if pwd.Length > math.MaxUint32 { return fmt.Errorf("packet window size (%d) exceeds maximum (%d)", pwd.Length, math.MaxUint32) } - m, e := packetWindowMmap(pwd) - if e != 0 { - return fmt.Errorf("failed to mmap packet window: %v", e) + m, err := memutil.MapFile(0, uintptr(pwd.Length), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset)) + if err != nil { + return fmt.Errorf("failed to mmap packet window: %v", err) } ep.packet = m ep.dataCap = uint32(pwd.Length) - uint32(PacketHeaderBytes) @@ -222,6 +223,23 @@ func (ep *Endpoint) RecvFirst() (uint32, error) { // * If ep is a client Endpoint, ep.Connect() has previously been called and // returned nil. func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) { + return ep.sendRecv(dataLen, false /* mayRetainP */) +} + +// SendRecvFast is equivalent to SendRecv, but may prevent the caller's runtime +// P from being released, in which case the calling goroutine continues to +// count against GOMAXPROCS while waiting for the peer Endpoint to return +// control to the caller. +// +// SendRecvFast is appropriate if the peer Endpoint is expected to consistently +// return control in a short amount of time (less than ~10ms). +// +// Preconditions: As for SendRecv. +func (ep *Endpoint) SendRecvFast(dataLen uint32) (uint32, error) { + return ep.sendRecv(dataLen, true /* mayRetainP */) +} + +func (ep *Endpoint) sendRecv(dataLen uint32, mayRetainP bool) (uint32, error) { if dataLen > ep.dataCap { panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap)) } @@ -232,7 +250,7 @@ func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) { // they can only shoot themselves in the foot. *ep.dataLen() = dataLen raceBecomeInactive() - if err := ep.ctrlRoundTrip(); err != nil { + if err := ep.ctrlRoundTrip(mayRetainP); err != nil { return 0, err } raceBecomeActive() diff --git a/pkg/flipcall/futex_linux.go b/pkg/flipcall/futex_linux.go index c212f05f1..4bb85939b 100644 --- a/pkg/flipcall/futex_linux.go +++ b/pkg/flipcall/futex_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package flipcall diff --git a/pkg/flipcall/packet_window_allocator.go b/pkg/flipcall/packet_window.go index 9122c97b7..9122c97b7 100644 --- a/pkg/flipcall/packet_window_allocator.go +++ b/pkg/flipcall/packet_window.go diff --git a/pkg/gohacks/gohacks_unsafe.go b/pkg/gohacks/gohacks_unsafe.go index 374aac2b4..a055b3e8d 100644 --- a/pkg/gohacks/gohacks_unsafe.go +++ b/pkg/gohacks/gohacks_unsafe.go @@ -12,10 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.13 -// +build !go1.18 +//go:build go1.13 && !go1.19 +// +build go1.13,!go1.19 -// Check type signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. + +// Check type signatures and Noescape when updating Go version. +// +// TODO(b/165820485): add these checks to checklinkname. // Package gohacks contains utilities for subverting the Go compiler. package gohacks diff --git a/pkg/goid/goid.go b/pkg/goid/goid.go index 193b2c2d4..0887f79ab 100644 --- a/pkg/goid/goid.go +++ b/pkg/goid/goid.go @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build go1.12 -// +build !go1.18 +//go:build go1.12 && !go1.19 +// +build go1.12,!go1.19 // Check type signatures when updating Go version. diff --git a/pkg/hostarch/hostarch_arm64.go b/pkg/hostarch/hostarch_arm64.go index a31a8aeeb..a65c810a5 100644 --- a/pkg/hostarch/hostarch_arm64.go +++ b/pkg/hostarch/hostarch_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package hostarch diff --git a/pkg/hostarch/hostarch_x86.go b/pkg/hostarch/hostarch_x86.go index af6ef2b7f..00bf668f3 100644 --- a/pkg/hostarch/hostarch_x86.go +++ b/pkg/hostarch/hostarch_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || 386 // +build amd64 386 package hostarch diff --git a/pkg/iovec/BUILD b/pkg/iovec/BUILD deleted file mode 100644 index f4e9a6af9..000000000 --- a/pkg/iovec/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "iovec", - srcs = ["iovec.go"], - visibility = ["//:sandbox"], - deps = ["@org_golang_x_sys//unix:go_default_library"], -) - -go_test( - name = "iovec_test", - size = "small", - srcs = ["iovec_test.go"], - library = ":iovec", - deps = ["@org_golang_x_sys//unix:go_default_library"], -) diff --git a/pkg/iovec/iovec.go b/pkg/iovec/iovec.go deleted file mode 100644 index a281c05b6..000000000 --- a/pkg/iovec/iovec.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build linux - -// Package iovec provides helpers to interact with vectorized I/O on host -// system. -package iovec - -import ( - "golang.org/x/sys/unix" -) - -// MaxIovs is the maximum number of iovecs host platform can accept. -var MaxIovs = 1024 - -// Builder is a builder for slice of unix.Iovec. -type Builder struct { - iovec []unix.Iovec - storage [8]unix.Iovec - - // overflow tracks the last buffer when iovec length is at MaxIovs. - overflow []byte -} - -// Add adds buf to b preparing to be written. Zero-length buf won't be added. -func (b *Builder) Add(buf []byte) { - if len(buf) == 0 { - return - } - if b.iovec == nil { - b.iovec = b.storage[:0] - } - if len(b.iovec) >= MaxIovs { - b.addByAppend(buf) - return - } - - b.iovec = append(b.iovec, unix.Iovec{Base: &buf[0]}) - b.iovec[len(b.iovec)-1].SetLen(len(buf)) - - // Keep the last buf if iovec is at max capacity. We will need to append to it - // for later bufs. - if len(b.iovec) == MaxIovs { - n := len(buf) - b.overflow = buf[:n:n] - } -} - -func (b *Builder) addByAppend(buf []byte) { - b.overflow = append(b.overflow, buf...) - b.iovec[len(b.iovec)-1] = unix.Iovec{Base: &b.overflow[0]} - b.iovec[len(b.iovec)-1].SetLen(len(b.overflow)) -} - -// Build returns the final Iovec slice. The length of returned iovec will not -// excceed MaxIovs. -func (b *Builder) Build() []unix.Iovec { - return b.iovec -} diff --git a/pkg/iovec/iovec_test.go b/pkg/iovec/iovec_test.go deleted file mode 100644 index f6deb4208..000000000 --- a/pkg/iovec/iovec_test.go +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build linux - -package iovec - -import ( - "bytes" - "fmt" - "testing" - "unsafe" - - "golang.org/x/sys/unix" -) - -func TestBuilderEmpty(t *testing.T) { - var builder Builder - iovecs := builder.Build() - if got, want := len(iovecs), 0; got != want { - t.Errorf("len(iovecs) = %d, want %d", got, want) - } -} - -func TestBuilderBuild(t *testing.T) { - a := []byte{1, 2} - b := []byte{3, 4, 5} - - var builder Builder - builder.Add(a) - builder.Add(b) - builder.Add(nil) // Nil slice won't be added. - builder.Add([]byte{}) // Empty slice won't be added. - iovecs := builder.Build() - - if got, want := len(iovecs), 2; got != want { - t.Fatalf("len(iovecs) = %d, want %d", got, want) - } - for i, data := range [][]byte{a, b} { - if got, want := *iovecs[i].Base, data[0]; got != want { - t.Fatalf("*iovecs[%d].Base = %d, want %d", i, got, want) - } - if got, want := iovecs[i].Len, uint64(len(data)); got != want { - t.Fatalf("iovecs[%d].Len = %d, want %d", i, got, want) - } - } -} - -func TestBuilderBuildMaxIov(t *testing.T) { - for _, test := range []struct { - numIov int - }{ - { - numIov: MaxIovs - 1, - }, - { - numIov: MaxIovs, - }, - { - numIov: MaxIovs + 1, - }, - { - numIov: MaxIovs + 10, - }, - } { - name := fmt.Sprintf("numIov=%v", test.numIov) - t.Run(name, func(t *testing.T) { - var data []byte - var builder Builder - for i := 0; i < test.numIov; i++ { - buf := []byte{byte(i)} - builder.Add(buf) - data = append(data, buf...) - } - iovec := builder.Build() - - // Check the expected length of iovec. - wantNum := test.numIov - if wantNum > MaxIovs { - wantNum = MaxIovs - } - if got, want := len(iovec), wantNum; got != want { - t.Errorf("len(iovec) = %d, want %d", got, want) - } - - // Test a real read-write. - var fds [2]int - if err := unix.Pipe(fds[:]); err != nil { - t.Fatalf("Pipe: %v", err) - } - defer unix.Close(fds[0]) - defer unix.Close(fds[1]) - - wrote, _, e := unix.RawSyscall(unix.SYS_WRITEV, uintptr(fds[1]), uintptr(unsafe.Pointer(&iovec[0])), uintptr(len(iovec))) - if int(wrote) != len(data) || e != 0 { - t.Fatalf("writev: %v, %v; want %v, 0", wrote, e, len(data)) - } - - got := make([]byte, len(data)) - if n, err := unix.Read(fds[0], got); n != len(got) || err != nil { - t.Fatalf("read: %v, %v; want %v, nil", n, err, len(got)) - } - - if !bytes.Equal(got, data) { - t.Errorf("read: got data %v, want %v", got, data) - } - }) - } -} diff --git a/pkg/linuxerr/linuxerr.go b/pkg/linuxerr/linuxerr.go deleted file mode 100644 index f45caaadf..000000000 --- a/pkg/linuxerr/linuxerr.go +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright 2021 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package linuxerr contains syscall error codes exported as an error interface -// pointers. This allows for fast comparison and return operations comperable -// to unix.Errno constants. -package linuxerr - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" -) - -// Error represents a syscall errno with a descriptive message. -type Error struct { - errno linux.Errno - message string -} - -func new(err linux.Errno, message string) *Error { - return &Error{ - errno: err, - message: message, - } -} - -// Error implements error.Error. -func (e *Error) Error() string { return e.message } - -// Errno returns the underlying linux.Errno value. -func (e *Error) Errno() linux.Errno { return e.errno } - -// The following varables have the same meaning as their errno equivalent. - -// Errno values from include/uapi/asm-generic/errno-base.h. -var ( - EPERM = new(linux.EPERM, "operation not permitted") - ENOENT = new(linux.ENOENT, "no such file or directory") - ESRCH = new(linux.ESRCH, "no such process") - EINTR = new(linux.EINTR, "interrupted system call") - EIO = new(linux.EIO, "I/O error") - ENXIO = new(linux.ENXIO, "no such device or address") - E2BIG = new(linux.E2BIG, "argument list too long") - ENOEXEC = new(linux.ENOEXEC, "exec format error") - EBADF = new(linux.EBADF, "bad file number") - ECHILD = new(linux.ECHILD, "no child processes") - EAGAIN = new(linux.EAGAIN, "try again") - ENOMEM = new(linux.ENOMEM, "out of memory") - EACCES = new(linux.EACCES, "permission denied") - EFAULT = new(linux.EFAULT, "bad address") - ENOTBLK = new(linux.ENOTBLK, "block device required") - EBUSY = new(linux.EBUSY, "device or resource busy") - EEXIST = new(linux.EEXIST, "file exists") - EXDEV = new(linux.EXDEV, "cross-device link") - ENODEV = new(linux.ENODEV, "no such device") - ENOTDIR = new(linux.ENOTDIR, "not a directory") - EISDIR = new(linux.EISDIR, "is a directory") - EINVAL = new(linux.EINVAL, "invalid argument") - ENFILE = new(linux.ENFILE, "file table overflow") - EMFILE = new(linux.EMFILE, "too many open files") - ENOTTY = new(linux.ENOTTY, "not a typewriter") - ETXTBSY = new(linux.ETXTBSY, "text file busy") - EFBIG = new(linux.EFBIG, "file too large") - ENOSPC = new(linux.ENOSPC, "no space left on device") - ESPIPE = new(linux.ESPIPE, "illegal seek") - EROFS = new(linux.EROFS, "read-only file system") - EMLINK = new(linux.EMLINK, "too many links") - EPIPE = new(linux.EPIPE, "broken pipe") - EDOM = new(linux.EDOM, "math argument out of domain of func") - ERANGE = new(linux.ERANGE, "math result not representable") -) - -// Errno values from include/uapi/asm-generic/errno.h. -var ( - EDEADLK = new(linux.EDEADLK, "resource deadlock would occur") - ENAMETOOLONG = new(linux.ENAMETOOLONG, "file name too long") - ENOLCK = new(linux.ENOLCK, "no record locks available") - ENOSYS = new(linux.ENOSYS, "invalid system call number") - ENOTEMPTY = new(linux.ENOTEMPTY, "directory not empty") - ELOOP = new(linux.ELOOP, "too many symbolic links encountered") - EWOULDBLOCK = new(linux.EWOULDBLOCK, "operation would block") - ENOMSG = new(linux.ENOMSG, "no message of desired type") - EIDRM = new(linux.EIDRM, "identifier removed") - ECHRNG = new(linux.ECHRNG, "channel number out of range") - EL2NSYNC = new(linux.EL2NSYNC, "level 2 not synchronized") - EL3HLT = new(linux.EL3HLT, "level 3 halted") - EL3RST = new(linux.EL3RST, "level 3 reset") - ELNRNG = new(linux.ELNRNG, "link number out of range") - EUNATCH = new(linux.EUNATCH, "protocol driver not attached") - ENOCSI = new(linux.ENOCSI, "no CSI structure available") - EL2HLT = new(linux.EL2HLT, "level 2 halted") - EBADE = new(linux.EBADE, "invalid exchange") - EBADR = new(linux.EBADR, "invalid request descriptor") - EXFULL = new(linux.EXFULL, "exchange full") - ENOANO = new(linux.ENOANO, "no anode") - EBADRQC = new(linux.EBADRQC, "invalid request code") - EBADSLT = new(linux.EBADSLT, "invalid slot") - EDEADLOCK = new(linux.EDEADLOCK, EDEADLK.message) - EBFONT = new(linux.EBFONT, "bad font file format") - ENOSTR = new(linux.ENOSTR, "device not a stream") - ENODATA = new(linux.ENODATA, "no data available") - ETIME = new(linux.ETIME, "timer expired") - ENOSR = new(linux.ENOSR, "out of streams resources") - ENONET = new(linux.ENOENT, "machine is not on the network") - ENOPKG = new(linux.ENOPKG, "package not installed") - EREMOTE = new(linux.EREMOTE, "object is remote") - ENOLINK = new(linux.ENOLINK, "link has been severed") - EADV = new(linux.EADV, "advertise error") - ESRMNT = new(linux.ESRMNT, "srmount error") - ECOMM = new(linux.ECOMM, "communication error on send") - EPROTO = new(linux.EPROTO, "protocol error") - EMULTIHOP = new(linux.EMULTIHOP, "multihop attempted") - EDOTDOT = new(linux.EDOTDOT, "RFS specific error") - EBADMSG = new(linux.EBADMSG, "not a data message") - EOVERFLOW = new(linux.EOVERFLOW, "value too large for defined data type") - ENOTUNIQ = new(linux.ENOTUNIQ, "name not unique on network") - EBADFD = new(linux.EBADFD, "file descriptor in bad state") - EREMCHG = new(linux.EREMCHG, "remote address changed") - ELIBACC = new(linux.ELIBACC, "can not access a needed shared library") - ELIBBAD = new(linux.ELIBBAD, "accessing a corrupted shared library") - ELIBSCN = new(linux.ELIBSCN, ".lib section in a.out corrupted") - ELIBMAX = new(linux.ELIBMAX, "attempting to link in too many shared libraries") - ELIBEXEC = new(linux.ELIBEXEC, "cannot exec a shared library directly") - EILSEQ = new(linux.EILSEQ, "illegal byte sequence") - ERESTART = new(linux.ERESTART, "interrupted system call should be restarted") - ESTRPIPE = new(linux.ESTRPIPE, "streams pipe error") - EUSERS = new(linux.EUSERS, "too many users") - ENOTSOCK = new(linux.ENOTSOCK, "socket operation on non-socket") - EDESTADDRREQ = new(linux.EDESTADDRREQ, "destination address required") - EMSGSIZE = new(linux.EMSGSIZE, "message too long") - EPROTOTYPE = new(linux.EPROTOTYPE, "protocol wrong type for socket") - ENOPROTOOPT = new(linux.ENOPROTOOPT, "protocol not available") - EPROTONOSUPPORT = new(linux.EPROTONOSUPPORT, "protocol not supported") - ESOCKTNOSUPPORT = new(linux.ESOCKTNOSUPPORT, "socket type not supported") - EOPNOTSUPP = new(linux.EOPNOTSUPP, "operation not supported on transport endpoint") - EPFNOSUPPORT = new(linux.EPFNOSUPPORT, "protocol family not supported") - EAFNOSUPPORT = new(linux.EAFNOSUPPORT, "address family not supported by protocol") - EADDRINUSE = new(linux.EADDRINUSE, "address already in use") - EADDRNOTAVAIL = new(linux.EADDRNOTAVAIL, "cannot assign requested address") - ENETDOWN = new(linux.ENETDOWN, "network is down") - ENETUNREACH = new(linux.ENETUNREACH, "network is unreachable") - ENETRESET = new(linux.ENETRESET, "network dropped connection because of reset") - ECONNABORTED = new(linux.ECONNABORTED, "software caused connection abort") - ECONNRESET = new(linux.ECONNRESET, "connection reset by peer") - ENOBUFS = new(linux.ENOBUFS, "no buffer space available") - EISCONN = new(linux.EISCONN, "transport endpoint is already connected") - ENOTCONN = new(linux.ENOTCONN, "transport endpoint is not connected") - ESHUTDOWN = new(linux.ESHUTDOWN, "cannot send after transport endpoint shutdown") - ETOOMANYREFS = new(linux.ETOOMANYREFS, "too many references: cannot splice") - ETIMEDOUT = new(linux.ETIMEDOUT, "connection timed out") - ECONNREFUSED = new(linux.ECONNREFUSED, "connection refused") - EHOSTDOWN = new(linux.EHOSTDOWN, "host is down") - EHOSTUNREACH = new(linux.EHOSTUNREACH, "no route to host") - EALREADY = new(linux.EALREADY, "operation already in progress") - EINPROGRESS = new(linux.EINPROGRESS, "operation now in progress") - ESTALE = new(linux.ESTALE, "stale file handle") - EUCLEAN = new(linux.EUCLEAN, "structure needs cleaning") - ENOTNAM = new(linux.ENOTNAM, "not a XENIX named type file") - ENAVAIL = new(linux.ENAVAIL, "no XENIX semaphores available") - EISNAM = new(linux.EISNAM, "is a named type file") - EREMOTEIO = new(linux.EREMOTEIO, "remote I/O error") - EDQUOT = new(linux.EDQUOT, "quota exceeded") - ENOMEDIUM = new(linux.ENOMEDIUM, "no medium found") - EMEDIUMTYPE = new(linux.EMEDIUMTYPE, "wrong medium type") - ECANCELED = new(linux.ECANCELED, "operation Canceled") - ENOKEY = new(linux.ENOKEY, "required key not available") - EKEYEXPIRED = new(linux.EKEYEXPIRED, "key has expired") - EKEYREVOKED = new(linux.EKEYREVOKED, "key has been revoked") - EKEYREJECTED = new(linux.EKEYREJECTED, "key was rejected by service") - EOWNERDEAD = new(linux.EOWNERDEAD, "owner died") - ENOTRECOVERABLE = new(linux.ENOTRECOVERABLE, "state not recoverable") - ERFKILL = new(linux.ERFKILL, "operation not possible due to RF-kill") - EHWPOISON = new(linux.EHWPOISON, "memory page has hardware error") -) diff --git a/pkg/linuxerr/linuxerr_test.go b/pkg/linuxerr/linuxerr_test.go deleted file mode 100644 index d34937e93..000000000 --- a/pkg/linuxerr/linuxerr_test.go +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package syserror_test - -import ( - "errors" - "testing" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/linuxerr" - "gvisor.dev/gvisor/pkg/syserror" -) - -var globalError error - -func BenchmarkAssignErrno(b *testing.B) { - for i := b.N; i > 0; i-- { - globalError = unix.EINVAL - } -} - -func BenchmarkLinuxerrAssignError(b *testing.B) { - for i := b.N; i > 0; i-- { - globalError = linuxerr.EINVAL - } -} - -func BenchmarkAssignSyserrorError(b *testing.B) { - for i := b.N; i > 0; i-- { - globalError = syserror.EINVAL - } -} - -func BenchmarkCompareErrno(b *testing.B) { - globalError = unix.EAGAIN - j := 0 - for i := b.N; i > 0; i-- { - if globalError == unix.EINVAL { - j++ - } - } -} - -func BenchmarkCompareLinuxerrError(b *testing.B) { - globalError = linuxerr.E2BIG - j := 0 - for i := b.N; i > 0; i-- { - if globalError == linuxerr.EINVAL { - j++ - } - } -} - -func BenchmarkCompareSyserrorError(b *testing.B) { - globalError = syserror.EAGAIN - j := 0 - for i := b.N; i > 0; i-- { - if globalError == syserror.EINVAL { - j++ - } - } -} - -func BenchmarkSwitchErrno(b *testing.B) { - globalError = unix.EPERM - j := 0 - for i := b.N; i > 0; i-- { - switch globalError { - case unix.EINVAL: - j++ - case unix.EINTR: - j += 2 - case unix.EAGAIN: - j += 3 - } - } -} - -func BenchmarkSwitchLinuxerrError(b *testing.B) { - globalError = linuxerr.EPERM - j := 0 - for i := b.N; i > 0; i-- { - switch globalError { - case linuxerr.EINVAL: - j++ - case linuxerr.EINTR: - j += 2 - case linuxerr.EAGAIN: - j += 3 - } - } -} - -func BenchmarkSwitchSyserrorError(b *testing.B) { - globalError = syserror.EPERM - j := 0 - for i := b.N; i > 0; i-- { - switch globalError { - case syserror.EINVAL: - j++ - case syserror.EINTR: - j += 2 - case syserror.EAGAIN: - j += 3 - } - } -} - -type translationTestTable struct { - fn string - errIn error - syscallErrorIn unix.Errno - expectedBool bool - expectedTranslation unix.Errno -} - -func TestErrorTranslation(t *testing.T) { - myError := errors.New("My test error") - myError2 := errors.New("Another test error") - testTable := []translationTestTable{ - {"TranslateError", myError, 0, false, 0}, - {"TranslateError", myError2, 0, false, 0}, - {"AddErrorTranslation", myError, unix.EAGAIN, true, 0}, - {"AddErrorTranslation", myError, unix.EAGAIN, false, 0}, - {"AddErrorTranslation", myError, unix.EPERM, false, 0}, - {"TranslateError", myError, 0, true, unix.EAGAIN}, - {"TranslateError", myError2, 0, false, 0}, - {"AddErrorTranslation", myError2, unix.EPERM, true, 0}, - {"AddErrorTranslation", myError2, unix.EPERM, false, 0}, - {"AddErrorTranslation", myError2, unix.EAGAIN, false, 0}, - {"TranslateError", myError, 0, true, unix.EAGAIN}, - {"TranslateError", myError2, 0, true, unix.EPERM}, - } - for _, tt := range testTable { - switch tt.fn { - case "TranslateError": - err, ok := syserror.TranslateError(tt.errIn) - if ok != tt.expectedBool { - t.Fatalf("%v(%v) => %v expected %v", tt.fn, tt.errIn, ok, tt.expectedBool) - } else if err != tt.expectedTranslation { - t.Fatalf("%v(%v) (error) => %v expected %v", tt.fn, tt.errIn, err, tt.expectedTranslation) - } - case "AddErrorTranslation": - ok := syserror.AddErrorTranslation(tt.errIn, tt.syscallErrorIn) - if ok != tt.expectedBool { - t.Fatalf("%v(%v) => %v expected %v", tt.fn, tt.errIn, ok, tt.expectedBool) - } - default: - t.Fatalf("Unknown function %v", tt.fn) - } - } -} diff --git a/pkg/marshal/primitive/BUILD b/pkg/marshal/primitive/BUILD index 190b57c29..6e5ce136d 100644 --- a/pkg/marshal/primitive/BUILD +++ b/pkg/marshal/primitive/BUILD @@ -12,9 +12,7 @@ go_library( "//:sandbox", ], deps = [ - "//pkg/context", "//pkg/hostarch", "//pkg/marshal", - "//pkg/usermem", ], ) diff --git a/pkg/marshal/primitive/primitive.go b/pkg/marshal/primitive/primitive.go index 6f38992b7..1c49cf082 100644 --- a/pkg/marshal/primitive/primitive.go +++ b/pkg/marshal/primitive/primitive.go @@ -19,10 +19,8 @@ package primitive import ( "io" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" - "gvisor.dev/gvisor/pkg/usermem" ) // Int8 is a marshal.Marshallable implementation for int8. @@ -400,26 +398,3 @@ func CopyStringOut(cc marshal.CopyContext, addr hostarch.Addr, src string) (int, srcP := ByteSlice(src) return srcP.CopyOut(cc, addr) } - -// IOCopyContext wraps an object implementing hostarch.IO to implement -// marshal.CopyContext. -type IOCopyContext struct { - Ctx context.Context - IO usermem.IO - Opts usermem.IOOpts -} - -// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. -func (i *IOCopyContext) CopyScratchBuffer(size int) []byte { - return make([]byte, size) -} - -// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. -func (i *IOCopyContext) CopyOutBytes(addr hostarch.Addr, b []byte) (int, error) { - return i.IO.CopyOut(i.Ctx, addr, b, i.Opts) -} - -// CopyInBytes implements marshal.CopyContext.CopyInBytes. -func (i *IOCopyContext) CopyInBytes(addr hostarch.Addr, b []byte) (int, error) { - return i.IO.CopyIn(i.Ctx, addr, b, i.Opts) -} diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD index 9d07d98b4..bea595286 100644 --- a/pkg/memutil/BUILD +++ b/pkg/memutil/BUILD @@ -4,7 +4,11 @@ package(licenses = ["notice"]) go_library( name = "memutil", - srcs = ["memutil_unsafe.go"], + srcs = [ + "memfd_linux_unsafe.go", + "memutil.go", + "mmap.go", + ], visibility = ["//visibility:public"], deps = ["@org_golang_x_sys//unix:go_default_library"], ) diff --git a/pkg/memutil/memutil_unsafe.go b/pkg/memutil/memfd_linux_unsafe.go index 6676d1ce3..2179c92f3 100644 --- a/pkg/memutil/memutil_unsafe.go +++ b/pkg/memutil/memfd_linux_unsafe.go @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux -// Package memutil provides a wrapper for the memfd_create() system call. package memutil import ( diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/memutil/memutil.go index e4ce484e4..3185882fd 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go +++ b/pkg/memutil/memutil.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,17 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. -package disklayout - -import ( - "testing" -) - -// TestBlockGroupSize tests that the block group descriptor structs are of the -// correct size. -func TestBlockGroupSize(t *testing.T) { - var bgSmall BlockGroup32Bit - assertSize(t, &bgSmall, 32) - var bgBig BlockGroup64Bit - assertSize(t, &bgBig, 64) -} +// Package memutil provides utilities for working with shared memory files. +package memutil diff --git a/pkg/flipcall/packet_window_mmap_amd64.go b/pkg/memutil/mmap.go index ced587a2a..7a55d1b28 100644 --- a/pkg/flipcall/packet_window_mmap_amd64.go +++ b/pkg/memutil/mmap.go @@ -12,12 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -package flipcall +//go:build go1.1 +// +build go1.1 -import "golang.org/x/sys/unix" +package memutil -// Return a memory mapping of the pwd in memory that can be shared outside the sandbox. -func packetWindowMmap(pwd PacketWindowDescriptor) (uintptr, unix.Errno) { - m, _, err := unix.RawSyscall6(unix.SYS_MMAP, 0, uintptr(pwd.Length), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset)) - return m, err +import ( + "golang.org/x/sys/unix" +) + +// MapFile returns a memory mapping configured by the given options as per +// mmap(2). +func MapFile(addr, len, prot, flags, fd, offset uintptr) (uintptr, error) { + m, _, e := unix.RawSyscall6(unix.SYS_MMAP, addr, len, prot, flags, fd, offset) + if e != 0 { + return 0, e + } + return m, nil } diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go index ac7868ad9..6358ad8e9 100644 --- a/pkg/merkletree/merkletree.go +++ b/pkg/merkletree/merkletree.go @@ -151,21 +151,21 @@ type VerityDescriptor struct { Mode uint32 UID uint32 GID uint32 - Children map[string]struct{} + Children []string SymlinkTarget string RootHash []byte } -func (d *VerityDescriptor) String() string { +func (d *VerityDescriptor) encode() []byte { b := new(bytes.Buffer) e := gob.NewEncoder(b) - e.Encode(d.Children) - return fmt.Sprintf("Name: %s, Size: %d, Mode: %d, UID: %d, GID: %d, Children: %v, Symlink: %s, RootHash: %v", d.Name, d.FileSize, d.Mode, d.UID, d.GID, b.Bytes(), d.SymlinkTarget, d.RootHash) + e.Encode(d) + return b.Bytes() } // verify generates a hash from d, and compares it with expected. func (d *VerityDescriptor) verify(expected []byte, hashAlgorithms int) error { - h, err := hashData([]byte(d.String()), hashAlgorithms) + h, err := hashData(d.encode(), hashAlgorithms) if err != nil { return err } @@ -210,7 +210,7 @@ type GenerateParams struct { GID uint32 // Children is a map of children names for a directory. It should be // empty for a regular file. - Children map[string]struct{} + Children []string // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink. SymlinkTarget string // HashAlgorithms is the algorithms used to hash data. @@ -242,7 +242,7 @@ func Generate(params *GenerateParams) ([]byte, error) { // If file is a symlink do not generate root hash for file content. if params.SymlinkTarget != "" { - return hashData([]byte(descriptor.String()), params.HashAlgorithms) + return hashData(descriptor.encode(), params.HashAlgorithms) } layout, err := InitLayout(params.Size, params.HashAlgorithms, params.DataAndTreeInSameFile) @@ -315,7 +315,7 @@ func Generate(params *GenerateParams) ([]byte, error) { numBlocks = (numBlocks + layout.hashesPerBlock() - 1) / layout.hashesPerBlock() } descriptor.RootHash = root - return hashData([]byte(descriptor.String()), params.HashAlgorithms) + return hashData(descriptor.encode(), params.HashAlgorithms) } // VerifyParams contains the params used to verify a portion of a file against @@ -339,7 +339,7 @@ type VerifyParams struct { GID uint32 // Children is a map of children names for a directory. It should be // empty for a regular file. - Children map[string]struct{} + Children []string // SymlinkTarget is the target path of a symlink file, or "" if the file is not a symlink. SymlinkTarget string // HashAlgorithms is the algorithms used to hash data. @@ -384,6 +384,14 @@ func verifyMetadata(params *VerifyParams, layout *Layout) error { return descriptor.verify(params.Expected, params.HashAlgorithms) } +// cachedHashes stores verified hashes from a previous hash step. +type cachedHashes struct { + // offset is the offset of cached hash in each level. + offset []int64 + // hash is the verified cache for each level from previous hash steps. + hash [][]byte +} + // Verify verifies the content read from data with offset. The content is // verified against tree. If content spans across multiple blocks, each block is // verified. Verification fails if the hash of the data does not match the tree @@ -409,29 +417,32 @@ func Verify(params *VerifyParams) (int64, error) { firstDataBlock := params.ReadOffset / layout.blockSize lastDataBlock := (params.ReadOffset + params.ReadSize - 1) / layout.blockSize - buf := make([]byte, layout.blockSize) - var readErr error - total := int64(0) + size := (lastDataBlock - firstDataBlock + 1) * layout.blockSize + retBuf := make([]byte, size) + n, err := params.File.ReadAt(retBuf, firstDataBlock*layout.blockSize) + if err != nil && err != io.EOF { + return 0, err + } + total := int64(n) + bytesRead := int64(0) + + // Only cache hash results if reading more than a block. + var ch *cachedHashes + if lastDataBlock > firstDataBlock { + ch = &cachedHashes{ + offset: make([]int64, layout.numLevels()), + hash: make([][]byte, layout.numLevels()), + } + } for i := firstDataBlock; i <= lastDataBlock; i++ { + // Reach the end of file during verification. + if total <= 0 { + return bytesRead, io.EOF + } // Read a block that includes all or part of target range in // input data. - bytesRead, err := params.File.ReadAt(buf, i*layout.blockSize) - readErr = err - // If at the end of input data and all previous blocks are - // verified, return the verified input data and EOF. - if readErr == io.EOF && bytesRead == 0 { - break - } - if readErr != nil && readErr != io.EOF { - return 0, fmt.Errorf("read from data failed: %w", err) - } - // If this is the end of file, zero the remaining bytes in buf, - // otherwise they are still from the previous block. - if bytesRead < len(buf) { - for j := bytesRead; j < len(buf); j++ { - buf[j] = 0 - } - } + buf := retBuf[(i-firstDataBlock)*layout.blockSize : (i-firstDataBlock+1)*layout.blockSize] + descriptor := VerityDescriptor{ Name: params.Name, FileSize: params.Size, @@ -441,8 +452,8 @@ func Verify(params *VerifyParams) (int64, error) { SymlinkTarget: params.SymlinkTarget, Children: params.Children, } - if err := verifyBlock(params.Tree, &descriptor, &layout, buf, i, params.HashAlgorithms, params.Expected); err != nil { - return 0, err + if err := verifyBlock(params.Tree, &descriptor, &layout, buf, i, params.HashAlgorithms, params.Expected, ch); err != nil { + return bytesRead, err } // startOff is the beginning of the read range within the @@ -459,22 +470,24 @@ func Verify(params *VerifyParams) (int64, error) { if i == lastDataBlock { endOff = (params.ReadOffset+params.ReadSize-1)%layout.blockSize + 1 } + // If the provided size exceeds the end of input data, we should // only copy the parts in buf that's part of input data. - if startOff > int64(bytesRead) { - startOff = int64(bytesRead) + if startOff > total { + startOff = total } - if endOff > int64(bytesRead) { - endOff = int64(bytesRead) + if endOff > total { + endOff = total } + n, err := params.Out.Write(buf[startOff:endOff]) if err != nil { - return total, err + return bytesRead, err } - total += int64(n) - + bytesRead += int64(n) + total -= endOff } - return total, readErr + return bytesRead, nil } // verifyBlock verifies a block against tree. index is the number of block in @@ -482,7 +495,7 @@ func Verify(params *VerifyParams) (int64, error) { // fails if the calculated hash from block is different from any level of // hashes stored in tree. And the final root hash is compared with // expected. -func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, dataBlock []byte, blockIndex int64, hashAlgorithms int, expected []byte) error { +func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, dataBlock []byte, blockIndex int64, hashAlgorithms int, expected []byte, ch *cachedHashes) error { if len(dataBlock) != int(layout.blockSize) { return fmt.Errorf("incorrect block size") } @@ -491,6 +504,12 @@ func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, treeBlock := make([]byte, layout.blockSize) var digest []byte for level := 0; level < layout.numLevels(); level++ { + // No need to verify remaining levels if the current block has + // been verified in a previous call and cached. + if ch != nil && ch.offset[level] == layout.digestOffset(level, blockIndex) && ch.hash[level] != nil { + break + } + // Calculate hash. if level == 0 { h, err := hashData(dataBlock, hashAlgorithms) @@ -521,11 +540,19 @@ func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, if !bytes.Equal(digest, expectedDigest) { return fmt.Errorf("verification failed") } + if ch != nil { + ch.offset[level] = layout.digestOffset(level, blockIndex) + ch.hash[level] = expectedDigest + } blockIndex = blockIndex / layout.hashesPerBlock() } // Verification for the tree succeeded. Now hash the descriptor with // the root hash and compare it with expected. - descriptor.RootHash = digest + if ch != nil { + descriptor.RootHash = ch.hash[layout.rootLevel()] + } else { + descriptor.RootHash = digest + } return descriptor.verify(expected, hashAlgorithms) } diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go index 5d6f8df1b..1447fd139 100644 --- a/pkg/merkletree/merkletree_test.go +++ b/pkg/merkletree/merkletree_test.go @@ -206,112 +206,112 @@ func TestGenerate(t *testing.T) { data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{9, 115, 238, 230, 38, 140, 195, 70, 207, 144, 202, 118, 23, 113, 32, 129, 226, 239, 177, 69, 161, 26, 14, 113, 16, 37, 30, 96, 19, 148, 132, 27}, + expectedHash: []byte{78, 38, 225, 107, 61, 246, 26, 6, 71, 163, 254, 97, 112, 200, 87, 232, 190, 87, 231, 160, 119, 124, 61, 229, 49, 126, 90, 223, 134, 51, 77, 182}, }, { name: "OnePageZeroesSHA256SameFile", data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{9, 115, 238, 230, 38, 140, 195, 70, 207, 144, 202, 118, 23, 113, 32, 129, 226, 239, 177, 69, 161, 26, 14, 113, 16, 37, 30, 96, 19, 148, 132, 27}, + expectedHash: []byte{78, 38, 225, 107, 61, 246, 26, 6, 71, 163, 254, 97, 112, 200, 87, 232, 190, 87, 231, 160, 119, 124, 61, 229, 49, 126, 90, 223, 134, 51, 77, 182}, }, { name: "OnePageZeroesSHA512SeparateFile", data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{127, 8, 95, 11, 83, 101, 51, 39, 170, 235, 39, 43, 135, 243, 145, 118, 148, 58, 27, 155, 182, 205, 44, 47, 5, 223, 215, 17, 35, 16, 43, 104, 43, 11, 8, 88, 171, 7, 249, 243, 14, 62, 126, 218, 23, 159, 237, 237, 42, 226, 39, 25, 87, 48, 253, 191, 116, 213, 37, 3, 187, 152, 154, 14}, + expectedHash: []byte{221, 45, 182, 132, 61, 212, 227, 145, 150, 131, 98, 221, 195, 5, 89, 21, 188, 36, 250, 101, 85, 78, 197, 253, 193, 23, 74, 219, 28, 108, 77, 47, 65, 79, 123, 144, 50, 245, 109, 72, 71, 80, 24, 77, 158, 95, 242, 185, 109, 163, 105, 183, 67, 106, 55, 194, 223, 46, 12, 242, 165, 203, 172, 254}, }, { name: "OnePageZeroesSHA512SameFile", data: bytes.Repeat([]byte{0}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - expectedHash: []byte{127, 8, 95, 11, 83, 101, 51, 39, 170, 235, 39, 43, 135, 243, 145, 118, 148, 58, 27, 155, 182, 205, 44, 47, 5, 223, 215, 17, 35, 16, 43, 104, 43, 11, 8, 88, 171, 7, 249, 243, 14, 62, 126, 218, 23, 159, 237, 237, 42, 226, 39, 25, 87, 48, 253, 191, 116, 213, 37, 3, 187, 152, 154, 14}, + expectedHash: []byte{221, 45, 182, 132, 61, 212, 227, 145, 150, 131, 98, 221, 195, 5, 89, 21, 188, 36, 250, 101, 85, 78, 197, 253, 193, 23, 74, 219, 28, 108, 77, 47, 65, 79, 123, 144, 50, 245, 109, 72, 71, 80, 24, 77, 158, 95, 242, 185, 109, 163, 105, 183, 67, 106, 55, 194, 223, 46, 12, 242, 165, 203, 172, 254}, }, { name: "MultiplePageZeroesSHA256SeparateFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{247, 158, 42, 215, 180, 106, 0, 28, 77, 64, 132, 162, 74, 65, 250, 161, 243, 66, 129, 44, 197, 8, 145, 14, 94, 206, 156, 184, 145, 145, 20, 185}, + expectedHash: []byte{131, 122, 73, 143, 4, 202, 193, 156, 218, 169, 196, 223, 70, 100, 117, 191, 241, 113, 134, 11, 229, 231, 105, 157, 156, 0, 66, 213, 122, 145, 174, 8}, }, { name: "MultiplePageZeroesSHA256SameFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{247, 158, 42, 215, 180, 106, 0, 28, 77, 64, 132, 162, 74, 65, 250, 161, 243, 66, 129, 44, 197, 8, 145, 14, 94, 206, 156, 184, 145, 145, 20, 185}, + expectedHash: []byte{131, 122, 73, 143, 4, 202, 193, 156, 218, 169, 196, 223, 70, 100, 117, 191, 241, 113, 134, 11, 229, 231, 105, 157, 156, 0, 66, 213, 122, 145, 174, 8}, }, { name: "MultiplePageZeroesSHA512SeparateFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{100, 121, 14, 30, 104, 200, 142, 182, 190, 78, 23, 68, 157, 174, 23, 75, 174, 250, 250, 25, 66, 45, 235, 103, 129, 49, 78, 127, 173, 154, 121, 35, 37, 115, 60, 217, 26, 205, 253, 253, 236, 145, 107, 109, 232, 19, 72, 92, 4, 191, 181, 205, 191, 57, 234, 177, 144, 235, 143, 30, 15, 197, 109, 81}, + expectedHash: []byte{211, 48, 232, 110, 240, 51, 99, 241, 123, 138, 42, 76, 94, 86, 59, 200, 3, 246, 137, 148, 189, 226, 111, 103, 146, 29, 12, 218, 40, 182, 33, 99, 193, 163, 238, 26, 184, 13, 165, 187, 68, 173, 139, 9, 208, 59, 0, 192, 180, 50, 221, 35, 43, 119, 194, 16, 64, 84, 116, 63, 158, 195, 194, 226}, }, { name: "MultiplePageZeroesSHA512SameFile", data: bytes.Repeat([]byte{0}, 128*hostarch.PageSize+1), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - expectedHash: []byte{100, 121, 14, 30, 104, 200, 142, 182, 190, 78, 23, 68, 157, 174, 23, 75, 174, 250, 250, 25, 66, 45, 235, 103, 129, 49, 78, 127, 173, 154, 121, 35, 37, 115, 60, 217, 26, 205, 253, 253, 236, 145, 107, 109, 232, 19, 72, 92, 4, 191, 181, 205, 191, 57, 234, 177, 144, 235, 143, 30, 15, 197, 109, 81}, + expectedHash: []byte{211, 48, 232, 110, 240, 51, 99, 241, 123, 138, 42, 76, 94, 86, 59, 200, 3, 246, 137, 148, 189, 226, 111, 103, 146, 29, 12, 218, 40, 182, 33, 99, 193, 163, 238, 26, 184, 13, 165, 187, 68, 173, 139, 9, 208, 59, 0, 192, 180, 50, 221, 35, 43, 119, 194, 16, 64, 84, 116, 63, 158, 195, 194, 226}, }, { name: "SingleASHA256SeparateFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{90, 124, 194, 100, 206, 242, 75, 152, 47, 249, 16, 27, 136, 161, 223, 228, 121, 241, 126, 158, 126, 122, 100, 120, 117, 15, 81, 78, 201, 133, 119, 111}, + expectedHash: []byte{26, 47, 238, 138, 235, 244, 140, 231, 129, 240, 155, 252, 219, 44, 46, 72, 57, 249, 139, 88, 132, 238, 86, 108, 181, 115, 96, 72, 99, 210, 134, 47}, }, { name: "SingleASHA256SameFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{90, 124, 194, 100, 206, 242, 75, 152, 47, 249, 16, 27, 136, 161, 223, 228, 121, 241, 126, 158, 126, 122, 100, 120, 117, 15, 81, 78, 201, 133, 119, 111}, + expectedHash: []byte{26, 47, 238, 138, 235, 244, 140, 231, 129, 240, 155, 252, 219, 44, 46, 72, 57, 249, 139, 88, 132, 238, 86, 108, 181, 115, 96, 72, 99, 210, 134, 47}, }, { name: "SingleASHA512SeparateFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{24, 10, 13, 25, 113, 62, 169, 99, 151, 70, 166, 113, 81, 81, 163, 85, 5, 25, 29, 15, 46, 37, 104, 120, 142, 218, 52, 178, 187, 83, 30, 166, 101, 87, 70, 196, 188, 61, 123, 20, 13, 254, 126, 52, 212, 111, 75, 203, 33, 233, 233, 47, 181, 161, 43, 193, 131, 41, 99, 33, 164, 73, 89, 152}, + expectedHash: []byte{44, 30, 224, 12, 102, 119, 163, 171, 119, 175, 212, 121, 231, 188, 125, 171, 79, 28, 144, 234, 75, 122, 44, 75, 15, 101, 173, 92, 233, 109, 234, 60, 173, 148, 125, 85, 94, 234, 95, 91, 16, 196, 88, 175, 23, 129, 226, 110, 24, 238, 5, 49, 186, 128, 72, 188, 193, 180, 207, 193, 203, 119, 40, 191}, }, { name: "SingleASHA512SameFile", data: []byte{'a'}, hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - expectedHash: []byte{24, 10, 13, 25, 113, 62, 169, 99, 151, 70, 166, 113, 81, 81, 163, 85, 5, 25, 29, 15, 46, 37, 104, 120, 142, 218, 52, 178, 187, 83, 30, 166, 101, 87, 70, 196, 188, 61, 123, 20, 13, 254, 126, 52, 212, 111, 75, 203, 33, 233, 233, 47, 181, 161, 43, 193, 131, 41, 99, 33, 164, 73, 89, 152}, + expectedHash: []byte{44, 30, 224, 12, 102, 119, 163, 171, 119, 175, 212, 121, 231, 188, 125, 171, 79, 28, 144, 234, 75, 122, 44, 75, 15, 101, 173, 92, 233, 109, 234, 60, 173, 148, 125, 85, 94, 234, 95, 91, 16, 196, 88, 175, 23, 129, 226, 110, 24, 238, 5, 49, 186, 128, 72, 188, 193, 180, 207, 193, 203, 119, 40, 191}, }, { name: "OnePageASHA256SeparateFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: false, - expectedHash: []byte{132, 54, 112, 142, 156, 19, 50, 140, 138, 240, 192, 154, 100, 120, 242, 69, 64, 217, 62, 166, 127, 88, 23, 197, 100, 66, 255, 215, 214, 229, 54, 1}, + expectedHash: []byte{166, 254, 83, 46, 241, 111, 18, 47, 79, 6, 181, 197, 176, 143, 211, 204, 53, 5, 245, 134, 172, 95, 97, 131, 236, 132, 197, 138, 123, 78, 43, 13}, }, { name: "OnePageASHA256SameFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, dataAndTreeInSameFile: true, - expectedHash: []byte{132, 54, 112, 142, 156, 19, 50, 140, 138, 240, 192, 154, 100, 120, 242, 69, 64, 217, 62, 166, 127, 88, 23, 197, 100, 66, 255, 215, 214, 229, 54, 1}, + expectedHash: []byte{166, 254, 83, 46, 241, 111, 18, 47, 79, 6, 181, 197, 176, 143, 211, 204, 53, 5, 245, 134, 172, 95, 97, 131, 236, 132, 197, 138, 123, 78, 43, 13}, }, { name: "OnePageASHA512SeparateFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: false, - expectedHash: []byte{165, 46, 176, 116, 47, 209, 101, 193, 64, 185, 30, 9, 52, 22, 24, 154, 135, 220, 232, 168, 215, 45, 222, 226, 207, 104, 160, 10, 156, 98, 245, 250, 76, 21, 68, 204, 65, 118, 69, 52, 210, 155, 36, 109, 233, 103, 1, 40, 218, 89, 125, 38, 247, 194, 2, 225, 119, 155, 65, 99, 182, 111, 110, 145}, + expectedHash: []byte{23, 69, 6, 79, 39, 232, 90, 246, 62, 55, 4, 229, 47, 36, 230, 24, 233, 47, 55, 36, 26, 139, 196, 78, 242, 12, 194, 77, 109, 81, 151, 188, 63, 201, 127, 235, 81, 214, 91, 200, 19, 232, 240, 14, 197, 1, 99, 224, 18, 213, 203, 242, 44, 102, 25, 62, 90, 189, 106, 107, 129, 61, 115, 39}, }, { name: "OnePageASHA512SameFile", data: bytes.Repeat([]byte{'a'}, hostarch.PageSize), hashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA512, dataAndTreeInSameFile: true, - expectedHash: []byte{165, 46, 176, 116, 47, 209, 101, 193, 64, 185, 30, 9, 52, 22, 24, 154, 135, 220, 232, 168, 215, 45, 222, 226, 207, 104, 160, 10, 156, 98, 245, 250, 76, 21, 68, 204, 65, 118, 69, 52, 210, 155, 36, 109, 233, 103, 1, 40, 218, 89, 125, 38, 247, 194, 2, 225, 119, 155, 65, 99, 182, 111, 110, 145}, + expectedHash: []byte{23, 69, 6, 79, 39, 232, 90, 246, 62, 55, 4, 229, 47, 36, 230, 24, 233, 47, 55, 36, 26, 139, 196, 78, 242, 12, 194, 77, 109, 81, 151, 188, 63, 201, 127, 235, 81, 214, 91, 200, 19, 232, 240, 14, 197, 1, 99, 224, 18, 213, 203, 242, 44, 102, 25, 62, 90, 189, 106, 107, 129, 61, 115, 39}, }, } @@ -324,7 +324,7 @@ func TestGenerate(t *testing.T) { Mode: defaultMode, UID: defaultUID, GID: defaultGID, - Children: make(map[string]struct{}), + Children: []string{}, HashAlgorithms: tc.hashAlgorithms, TreeReader: &tree, TreeWriter: &tree, @@ -366,7 +366,7 @@ func prepareVerify(t *testing.T, dataSize int64, hashAlgorithm int, dataAndTreeI Mode: defaultMode, UID: defaultUID, GID: defaultGID, - Children: make(map[string]struct{}), + Children: []string{}, HashAlgorithms: hashAlgorithm, TreeReader: &tree, TreeWriter: &tree, @@ -398,7 +398,7 @@ func prepareVerify(t *testing.T, dataSize int64, hashAlgorithm int, dataAndTreeI Mode: defaultMode, UID: defaultUID, GID: defaultGID, - Children: make(map[string]struct{}), + Children: []string{}, HashAlgorithms: hashAlgorithm, ReadOffset: verifyStart, ReadSize: verifySize, @@ -627,7 +627,7 @@ func TestVerifyModifiedChildren(t *testing.T) { t.Run(tc.name, func(t *testing.T) { var buf bytes.Buffer _, params := prepareVerify(t, hostarch.PageSize /* dataSize */, defaultHashAlgorithm, tc.dataAndTreeInSameFile, false /* isSymlink */, 0 /* verifyStart */, 0 /* verifySize */, &buf) - params.Children["abc"] = struct{}{} + params.Children = append(params.Children, "abc") if _, err := Verify(¶ms); errors.Is(err, nil) { t.Errorf("Verification succeeded when expected to fail") } diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index 0a6a5d215..c08792751 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -4,13 +4,16 @@ package(licenses = ["notice"]) go_library( name = "metric", - srcs = ["metric.go"], + srcs = [ + "metric.go", + ], visibility = ["//:sandbox"], deps = [ ":metric_go_proto", "//pkg/eventchannel", "//pkg/log", "//pkg/sync", + "@org_golang_google_protobuf//types/known/timestamppb", ], ) @@ -18,6 +21,9 @@ proto_library( name = "metric", srcs = ["metric.proto"], visibility = ["//:sandbox"], + deps = [ + "@com_google_protobuf//:timestamp_proto", + ], ) go_test( diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 4829ae7ce..ac38ec894 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -20,7 +20,9 @@ import ( "fmt" "sort" "sync/atomic" + "time" + "google.golang.org/protobuf/types/known/timestamppb" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" @@ -54,6 +56,27 @@ var ( }) ) +// InitStage is the name of a Sentry initialization stage. +type InitStage string + +// List of all Sentry initialization stages. +var ( + InitRestoreConfig InitStage = "restore_config" + InitExecConfig InitStage = "exec_config" + InitRestore InitStage = "restore" + InitCreateProcess InitStage = "create_process" + InitTaskStart InitStage = "task_start" + + // allStages is the list of allowed stages. + allStages = []InitStage{ + InitRestoreConfig, + InitExecConfig, + InitRestore, + InitCreateProcess, + InitTaskStart, + } +) + // Uint64Metric encapsulates a uint64 that represents some kind of metric to be // monitored. We currently support metrics with at most one field. // @@ -98,6 +121,10 @@ func Initialize() error { for _, v := range allMetrics.m { m.Metrics = append(m.Metrics, v.metadata) } + m.Stages = make([]string, 0, len(allStages)) + for _, s := range allStages { + m.Stages = append(m.Stages, string(s)) + } if err := eventchannel.Emit(&m); err != nil { return fmt.Errorf("unable to emit metric initialize event: %w", err) } @@ -287,34 +314,66 @@ func (m *Uint64Metric) IncrementBy(v uint64, fieldValues ...string) { } } -// metricSet holds named metrics. +// stageTiming contains timing data for an initialization stage. +type stageTiming struct { + stage InitStage + started time.Time + // ended is the zero time when the stage has not ended yet. + ended time.Time +} + +// inProgress returns whether this stage hasn't ended yet. +func (s stageTiming) inProgress() bool { + return !s.started.IsZero() && s.ended.IsZero() +} + +// metricSet holds metric data. type metricSet struct { + // Map of metrics. m map[string]customUint64Metric + + // mu protects the fields below. + mu sync.RWMutex + + // Information about the stages reached by the Sentry. Only appended to, so + // reading a shallow copy of the slice header concurrently is safe. + finished []stageTiming + + // The current stage in progress. + currentStage stageTiming } // makeMetricSet returns a new metricSet. func makeMetricSet() metricSet { return metricSet{ - m: make(map[string]customUint64Metric), + m: make(map[string]customUint64Metric), + finished: make([]stageTiming, 0, len(allStages)), } } // Values returns a snapshot of all values in m. func (m *metricSet) Values() metricValues { - vals := make(metricValues) + m.mu.Lock() + stages := m.finished[:] + m.mu.Unlock() + + vals := metricValues{ + m: make(map[string]interface{}, len(m.m)), + stages: stages, + } for k, v := range m.m { fields := v.metadata.GetFields() switch len(fields) { case 0: - vals[k] = v.value() + vals.m[k] = v.value() case 1: values := fields[0].GetAllowedValues() fieldsMap := make(map[string]uint64) for _, fieldValue := range values { fieldsMap[fieldValue] = v.value(fieldValue) } - vals[k] = fieldsMap + vals.m[k] = fieldsMap default: panic(fmt.Sprintf("Unsupported number of metric fields: %d", len(fields))) } @@ -322,10 +381,16 @@ func (m *metricSet) Values() metricValues { return vals } -// metricValues contains a copy of the values of all metrics. It is a map -// with key as metric name and value can be either uint64 or map[string]uint64 -// to support metrics with one field. -type metricValues map[string]interface{} +// metricValues contains a copy of the values of all metrics. +type metricValues struct { + // m is a map with key as metric name and value can be either uint64 or + // map[string]uint64 to support metrics with one field. + m map[string]interface{} + + // Information on when initialization stages were reached. Does not include + // the currently-ongoing stage, if any. + stages []stageTiming +} var ( // emitMu protects metricsAtLastEmit and ensures that all emitted @@ -354,8 +419,8 @@ func EmitMetricUpdate() { m := pb.MetricUpdate{} // On the first call metricsAtLastEmit will be empty. Include all // metrics then. - for k, v := range snapshot { - prev, ok := metricsAtLastEmit[k] + for k, v := range snapshot.m { + prev, ok := metricsAtLastEmit.m[k] switch t := v.(type) { case uint64: // Metric exists and value did not change. @@ -386,8 +451,23 @@ func EmitMetricUpdate() { } } + for s := len(metricsAtLastEmit.stages); s < len(snapshot.stages); s++ { + newStage := snapshot.stages[s] + m.StageTiming = append(m.StageTiming, &pb.StageTiming{ + Stage: string(newStage.stage), + Started: ×tamppb.Timestamp{ + Seconds: newStage.started.Unix(), + Nanos: int32(newStage.started.Nanosecond()), + }, + Ended: ×tamppb.Timestamp{ + Seconds: newStage.ended.Unix(), + Nanos: int32(newStage.ended.Nanosecond()), + }, + }) + } + metricsAtLastEmit = snapshot - if len(m.Metrics) == 0 { + if len(m.Metrics) == 0 && len(m.StageTiming) == 0 { return } @@ -399,9 +479,52 @@ func EmitMetricUpdate() { for _, metric := range m.Metrics { log.Debugf("%s: %+v", metric.Name, metric.Value) } + for _, stage := range m.StageTiming { + duration := time.Duration(stage.Ended.Seconds-stage.Started.Seconds)*time.Second + time.Duration(stage.Ended.Nanos-stage.Started.Nanos)*time.Nanosecond + log.Debugf("Stage %s took %v", stage.GetStage(), duration) + } } if err := eventchannel.Emit(&m); err != nil { log.Warningf("Unable to emit metrics: %s", err) } } + +// StartStage should be called when an initialization stage is started. +// It returns a function that must be called to indicate that the stage ended. +// Alternatively, future calls to StartStage will implicitly indicate that the +// previous stage ended. +// Stage information will be emitted in the next call to EmitMetricUpdate after +// a stage has ended. +// +// This function may (and is expected to) be called prior to final +// initialization of this metric library, as it has to capture early stages +// of Sentry initialization. +func StartStage(stage InitStage) func() { + now := time.Now() + allMetrics.mu.Lock() + defer allMetrics.mu.Unlock() + if allMetrics.currentStage.inProgress() { + endStage(now) + } + allMetrics.currentStage.stage = stage + allMetrics.currentStage.started = now + return func() { + now := time.Now() + allMetrics.mu.Lock() + defer allMetrics.mu.Unlock() + // The current stage may have been ended by another call to StartStage, so + // double-check prior to clearing the current stage. + if allMetrics.currentStage.inProgress() && allMetrics.currentStage.stage == stage { + endStage(now) + } + } +} + +// endStage marks allMetrics.currentStage as ended, adding it to the list of +// finished stages. It assumes allMetrics.mu is locked. +func endStage(when time.Time) { + allMetrics.currentStage.ended = when + allMetrics.finished = append(allMetrics.finished, allMetrics.currentStage) + allMetrics.currentStage = stageTiming{} +} diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto index 53c8b4b50..d466b6904 100644 --- a/pkg/metric/metric.proto +++ b/pkg/metric/metric.proto @@ -16,6 +16,8 @@ syntax = "proto3"; package gvisor; +import "google/protobuf/timestamp.proto"; + // MetricMetadata contains all of the metadata describing a single metric. message MetricMetadata { // name is the unique name of the metric, usually in a "directory" format @@ -63,6 +65,7 @@ message MetricMetadata { // future MetricUpdates. message MetricRegistration { repeated MetricMetadata metrics = 1; + repeated string stages = 2; } // MetricValue the value of a metric at a single point in time. @@ -79,9 +82,20 @@ message MetricValue { repeated string field_values = 4; } +// StageTiming represents a new stage that's been reached by the Sentry. +message StageTiming { + string stage = 1; + google.protobuf.Timestamp started = 2; + google.protobuf.Timestamp ended = 3; +} + // MetricUpdate contains new values for multiple distinct metrics. // // Metrics whose values have not changed are not included. message MetricUpdate { repeated MetricValue metrics = 1; + // Timing information of initialization stages reached since last update. + // The first MetricUpdate will include multiple entries, since metric + // initialization happens relatively late in the Sentry startup process. + repeated StageTiming stage_timing = 2; } diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go index 1b4a9e73a..0654bdf07 100644 --- a/pkg/metric/metric_test.go +++ b/pkg/metric/metric_test.go @@ -16,6 +16,7 @@ package metric import ( "testing" + "time" "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/eventchannel" @@ -352,3 +353,147 @@ func TestEmitMetricUpdateWithFields(t *testing.T) { t.Errorf("Field value weird2 not found: %+v", emitter) } } + +func TestMetricUpdateStageTiming(t *testing.T) { + defer reset() + + expectedTimings := map[InitStage]struct{ min, max time.Duration }{} + measureStage := func(stage InitStage, body func()) { + stageStarted := time.Now() + endStage := StartStage(stage) + bodyStarted := time.Now() + body() + bodyEnded := time.Now() + endStage() + stageEnded := time.Now() + + expectedTimings[stage] = struct{ min, max time.Duration }{ + min: bodyEnded.Sub(bodyStarted), + max: stageEnded.Sub(stageStarted), + } + } + checkStage := func(got *pb.StageTiming, want InitStage) { + if InitStage(got.GetStage()) != want { + t.Errorf("%v: got stage %q expected %q", got, got.GetStage(), want) + } + timingBounds, found := expectedTimings[want] + if !found { + t.Fatalf("invalid init stage name %q", want) + } + started := got.Started.AsTime() + ended := got.Ended.AsTime() + duration := ended.Sub(started) + if duration < timingBounds.min { + t.Errorf("stage %v: lasted %v, expected at least %v", want, duration, timingBounds.min) + } else if duration > timingBounds.max { + t.Errorf("stage %v: lasted %v, expected no more than %v", want, duration, timingBounds.max) + } + } + + // Test that it's legit to go through stages before metric registration. + measureStage("before_first_update_1", func() { + time.Sleep(100 * time.Millisecond) + }) + measureStage("before_first_update_2", func() { + time.Sleep(100 * time.Millisecond) + }) + + fooMetric, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription) + if err != nil { + t.Fatalf("Cannot register /foo: %v", err) + } + emitter.Reset() + Initialize() + EmitMetricUpdate() + + // We should have gotten the metric registration and the first MetricUpdate. + if len(emitter) != 2 { + t.Fatalf("emitter has %d messages (%v), expected %d", len(emitter), emitter, 2) + } + + if registration, ok := emitter[0].(*pb.MetricRegistration); !ok { + t.Errorf("first message is not MetricRegistration: %T / %v", emitter[0], emitter[0]) + } else if len(registration.Stages) != len(allStages) { + t.Errorf("MetricRegistration has %d stages (%v), expected %d (%v)", len(registration.Stages), registration.Stages, len(allStages), allStages) + } else { + for i := 0; i < len(allStages); i++ { + if InitStage(registration.Stages[i]) != allStages[i] { + t.Errorf("MetricRegistration.Stages[%d]: got %q want %q", i, registration.Stages[i], allStages[i]) + } + } + } + + if firstUpdate, ok := emitter[1].(*pb.MetricUpdate); !ok { + t.Errorf("second message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(firstUpdate.StageTiming) != 2 { + t.Errorf("MetricUpdate has %d stage timings (%v), expected %d", len(firstUpdate.StageTiming), firstUpdate.StageTiming, 2) + } else { + checkStage(firstUpdate.StageTiming[0], "before_first_update_1") + checkStage(firstUpdate.StageTiming[1], "before_first_update_2") + } + + // Ensure re-emitting doesn't cause another event to be sent. + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 0 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 0) + } + + // Generate monitoring data, we should get an event with no stages. + fooMetric.Increment() + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 1 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 1) + } else if update, ok := emitter[0].(*pb.MetricUpdate); !ok { + t.Errorf("message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(update.StageTiming) != 0 { + t.Errorf("unexpected stage timing information: %v", update.StageTiming) + } + + // Now generate new stages. + measureStage("foo_stage_1", func() { + time.Sleep(100 * time.Millisecond) + }) + measureStage("foo_stage_2", func() { + time.Sleep(100 * time.Millisecond) + }) + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 1 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 1) + } else if update, ok := emitter[0].(*pb.MetricUpdate); !ok { + t.Errorf("message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(update.Metrics) != 0 { + t.Errorf("MetricUpdate has %d metric value changes (%v), expected %d", len(update.Metrics), update.Metrics, 0) + } else if len(update.StageTiming) != 2 { + t.Errorf("MetricUpdate has %d stages (%v), expected %d", len(update.StageTiming), update.StageTiming, 2) + } else { + checkStage(update.StageTiming[0], "foo_stage_1") + checkStage(update.StageTiming[1], "foo_stage_2") + } + + // Now try generating data for both metrics and stages. + fooMetric.Increment() + measureStage("last_stage_1", func() { + time.Sleep(100 * time.Millisecond) + }) + measureStage("last_stage_2", func() { + time.Sleep(100 * time.Millisecond) + }) + fooMetric.Increment() + emitter.Reset() + EmitMetricUpdate() + if len(emitter) != 1 { + t.Fatalf("EmitMetricUpdate emitted %d events want %d", len(emitter), 1) + } else if update, ok := emitter[0].(*pb.MetricUpdate); !ok { + t.Errorf("message is not MetricUpdate: %T / %v", emitter[1], emitter[1]) + } else if len(update.Metrics) != 1 { + t.Errorf("MetricUpdate has %d metric value changes (%v), expected %d", len(update.Metrics), update.Metrics, 1) + } else if len(update.StageTiming) != 2 { + t.Errorf("MetricUpdate has %d stages (%v), expected %d", len(update.StageTiming), update.StageTiming, 2) + } else { + checkStage(update.StageTiming[0], "last_stage_1") + checkStage(update.StageTiming[1], "last_stage_2") + } +} diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index b2291ef97..2b22b2203 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -22,6 +22,9 @@ go_library( "version.go", ], deps = [ + "//pkg/abi/linux/errno", + "//pkg/errors", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/fdchannel", "//pkg/flipcall", diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 764f1f970..eb496f02f 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -528,7 +528,7 @@ func (c *Client) sendRecvChannel(t message, r message) error { } // Send the request and receive the server's response. - rsz, err := ch.send(t) + rsz, err := ch.send(t, false /* isServer */) if err != nil { // See above. c.channelsMu.Lock() diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 97e0231d6..8d6af2d6b 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -325,6 +325,12 @@ func (*DisallowServerCalls) Renamed(File, string) { func DefaultMultiGetAttr(start File, names []string) ([]FullStat, error) { stats := make([]FullStat, 0, len(names)) parent := start + closeParent := func() { + if parent != start { + _ = parent.Close() + } + } + defer closeParent() mask := AttrMaskAll() for i, name := range names { if len(name) == 0 && i == 0 { @@ -340,15 +346,14 @@ func DefaultMultiGetAttr(start File, names []string) ([]FullStat, error) { continue } qids, child, valid, attr, err := parent.WalkGetAttr([]string{name}) - if parent != start { - _ = parent.Close() - } if err != nil { if errors.Is(err, unix.ENOENT) { return stats, nil } return nil, err } + closeParent() + parent = child stats = append(stats, FullStat{ QID: qids[0], Valid: valid, @@ -357,13 +362,8 @@ func DefaultMultiGetAttr(start File, names []string) ([]FullStat, error) { if attr.Mode.FileType() != ModeDirectory { // Doesn't need to continue if entry is not a dir. Including symlinks // that cannot be followed. - _ = child.Close() break } - parent = child - } - if parent != start { - _ = parent.Close() } return stats, nil } diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 758e11b13..a8f8a9d03 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -23,6 +23,9 @@ import ( "sync/atomic" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" ) @@ -42,6 +45,8 @@ func ExtractErrno(err error) unix.Errno { // Attempt to unwrap. switch e := err.(type) { + case *errors.Error: + return unix.Errno(e.Errno()) case unix.Errno: return e case *os.PathError: @@ -62,6 +67,45 @@ func newErr(err error) *Rlerror { return &Rlerror{Error: uint32(ExtractErrno(err))} } +// ExtractLinuxerrErrno extracts a *errors.Error from a error, best effort. +// TODO(b/34162363): Merge this with ExtractErrno. +func ExtractLinuxerrErrno(err error) *errors.Error { + switch err { + case os.ErrNotExist: + return linuxerr.ENOENT + case os.ErrExist: + return linuxerr.EEXIST + case os.ErrPermission: + return linuxerr.EACCES + case os.ErrInvalid: + return linuxerr.EINVAL + } + + // Attempt to unwrap. + switch e := err.(type) { + case *errors.Error: + return e + case unix.Errno: + return linuxerr.ErrorFromErrno(errno.Errno(e)) + case *os.PathError: + return ExtractLinuxerrErrno(e.Err) + case *os.SyscallError: + return ExtractLinuxerrErrno(e.Err) + case *os.LinkError: + return ExtractLinuxerrErrno(e.Err) + } + + // Default case. + log.Warningf("unknown error: %v", err) + return linuxerr.EIO +} + +// newErrFromLinuxerr returns an Rlerror from the linuxerr list. +// TODO(b/34162363): Merge this with newErr. +func newErrFromLinuxerr(err error) *Rlerror { + return &Rlerror{Error: uint32(ExtractLinuxerrErrno(err).Errno())} +} + // handler is implemented for server-handled messages. // // See server.go for call information. diff --git a/pkg/p9/server.go b/pkg/p9/server.go index ff1172ed6..241ab44ef 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -19,7 +19,8 @@ import ( "runtime/debug" "sync/atomic" - "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdchannel" "gvisor.dev/gvisor/pkg/flipcall" @@ -483,7 +484,7 @@ func (cs *connState) lookupChannel(id uint32) *channel { func (cs *connState) handle(m message) (r message) { if !cs.reqGate.Enter() { // connState.stop() has been called; the connection is shutting down. - r = newErr(unix.ECONNRESET) + r = newErrFromLinuxerr(linuxerr.ECONNRESET) return } defer func() { @@ -498,15 +499,23 @@ func (cs *connState) handle(m message) (r message) { // Wrap in an EFAULT error; we don't really have a // better way to describe this kind of error. It will // usually manifest as a result of the test framework. - r = newErr(unix.EFAULT) + r = newErrFromLinuxerr(linuxerr.EFAULT) } }() if handler, ok := m.(handler); ok { // Call the message handler. r = handler.handle(cs) + // TODO(b/34162363):This is only here to make sure the server works with + // only linuxerr Errors, as the handlers work with both client and server. + // It will be removed a followup, when all the unix.Errno errors are + // replaced with linuxerr. + if rlError, ok := r.(*Rlerror); ok { + e := linuxerr.ErrorFromErrno(errno.Errno(rlError.Error)) + r = newErrFromLinuxerr(e) + } } else { // Produce an ENOSYS error. - r = newErr(unix.ENOSYS) + r = newErrFromLinuxerr(linuxerr.ENOSYS) } return } @@ -553,7 +562,7 @@ func (cs *connState) handleRequest() bool { // If it's not a connection error, but some other protocol error, // we can send a response immediately. cs.sendMu.Lock() - err := send(cs.conn, tag, newErr(err)) + err := send(cs.conn, tag, newErrFromLinuxerr(err)) cs.sendMu.Unlock() if err != nil { log.Debugf("p9.send: %v", err) diff --git a/pkg/p9/transport_flipcall.go b/pkg/p9/transport_flipcall.go index 802254a90..69a9f2537 100644 --- a/pkg/p9/transport_flipcall.go +++ b/pkg/p9/transport_flipcall.go @@ -85,7 +85,7 @@ func (ch *channel) service(cs *connState) error { } r := cs.handle(m) msgRegistry.put(m) - rsz, err = ch.send(r) + rsz, err = ch.send(r, true /* isServer */) if err != nil { return err } @@ -122,7 +122,7 @@ func (ch *channel) Close() error { // // The return value is the size of the received response. Not that in the // server case, this is the size of the next request. -func (ch *channel) send(m message) (uint32, error) { +func (ch *channel) send(m message, isServer bool) (uint32, error) { if log.IsLogging(log.Debug) { log.Debugf("send [channel @%p] %s", ch, m.String()) } @@ -162,7 +162,11 @@ func (ch *channel) send(m message) (uint32, error) { } // Perform the one-shot communication. - return ch.data.SendRecv(ssz) + if isServer { + return ch.data.SendRecv(ssz) + } + // RPCs are expected to return quickly rather than block. + return ch.data.SendRecvFast(ssz) } // recv decodes a message that exists on the channel. diff --git a/pkg/procid/procid.go b/pkg/procid/procid.go index 78b92422c..e0d42819d 100644 --- a/pkg/procid/procid.go +++ b/pkg/procid/procid.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + // Package procid provides a way to get the current system thread identifier. package procid diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s index c4307c523..7073a0810 100644 --- a/pkg/procid/procid_amd64.s +++ b/pkg/procid/procid_amd64.s @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build amd64 -// +build go1.8 -// +build !go1.18 +//go:build amd64 && go1.8 && !go1.19 && go1.1 +// +build amd64,go1.8,!go1.19,go1.1 + +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. #include "textflag.h" diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s index c1c409f3c..bdc3bdcb0 100644 --- a/pkg/procid/procid_arm64.s +++ b/pkg/procid/procid_arm64.s @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build arm64 -// +build go1.8 -// +build !go1.18 +//go:build arm64 && go1.8 && !go1.19 && go1.1 +// +build arm64,go1.8,!go1.19,go1.1 + +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. #include "textflag.h" diff --git a/pkg/rand/rand.go b/pkg/rand/rand.go index a2714784d..be0e85fdb 100644 --- a/pkg/rand/rand.go +++ b/pkg/rand/rand.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !linux // +build !linux // Package rand implements a cryptographically secure pseudorandom number diff --git a/pkg/refsvfs2/refs.go b/pkg/refsvfs2/refs.go index ef8beb659..fe3e4a1ca 100644 --- a/pkg/refsvfs2/refs.go +++ b/pkg/refsvfs2/refs.go @@ -28,6 +28,11 @@ type RefCounter interface { // DecRef decrements the object's reference count. Users of refs_template.Refs // may specify a destructor to be called once the reference count reaches zero. DecRef(ctx context.Context) +} + +// TryRefCounter is like RefCounter but allow the ref increment to be tried. +type TryRefCounter interface { + RefCounter // TryIncRef attempts to increment the reference count, but may fail if all // references have already been dropped, in which case it returns false. If diff --git a/pkg/refsvfs2/refs_template.go b/pkg/refsvfs2/refs_template.go index 1102c8adc..55b0a60a1 100644 --- a/pkg/refsvfs2/refs_template.go +++ b/pkg/refsvfs2/refs_template.go @@ -101,7 +101,7 @@ func (r *Refs) IncRef() { } } -// TryIncRef implements refs.RefCounter.TryIncRef. +// TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish diff --git a/pkg/ring0/aarch64.go b/pkg/ring0/aarch64.go index 3bda594f9..96c884844 100644 --- a/pkg/ring0/aarch64.go +++ b/pkg/ring0/aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/defs_amd64.go b/pkg/ring0/defs_amd64.go index 76776c65c..24f6e4cde 100644 --- a/pkg/ring0/defs_amd64.go +++ b/pkg/ring0/defs_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 diff --git a/pkg/ring0/defs_arm64.go b/pkg/ring0/defs_arm64.go index 0125690d2..3e212516f 100644 --- a/pkg/ring0/defs_arm64.go +++ b/pkg/ring0/defs_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/entry_amd64.go b/pkg/ring0/entry_amd64.go index d87b1fd00..afd646b0b 100644 --- a/pkg/ring0/entry_amd64.go +++ b/pkg/ring0/entry_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 @@ -31,6 +32,13 @@ import ( // executed from kernel mode or not and the appropriate stub is called. func sysenter() +// addrOfSysenter returns the start address of sysenter. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func addrOfSysenter() uintptr + // swapgs swaps the current GS value. // // This must be called prior to sysret/iret. @@ -39,6 +47,9 @@ func swapgs() // jumpToKernel jumps to the kernel version of the current RIP. func jumpToKernel() +// jumpToUser jumps to the user version of the current RIP. +func jumpToUser() + // sysret returns to userspace from a system call. // // The return code is the vector that interrupted execution. @@ -65,7 +76,12 @@ func exception() // This is used when processing kernel exceptions and syscalls. func resume() -// Start is the CPU entrypoint. +// start is the CPU entrypoint. +// +// See requirements below. +func start() + +// AddrOfStart return the address of the CPU entrypoint. // // The following start conditions must be satisfied: // @@ -78,7 +94,11 @@ func resume() // * c.EFER() should be the current EFER value. // // The CPU state will be set to c.Registers(). -func Start() +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func AddrOfStart() uintptr // Exception stubs. func divideByZero() @@ -104,28 +124,56 @@ func virtualizationException() func securityException() func syscallInt80() +// These returns the start address of the functions above. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func addrOfDivideByZero() uintptr +func addrOfDebug() uintptr +func addrOfNMI() uintptr +func addrOfBreakpoint() uintptr +func addrOfOverflow() uintptr +func addrOfBoundRangeExceeded() uintptr +func addrOfInvalidOpcode() uintptr +func addrOfDeviceNotAvailable() uintptr +func addrOfDoubleFault() uintptr +func addrOfCoprocessorSegmentOverrun() uintptr +func addrOfInvalidTSS() uintptr +func addrOfSegmentNotPresent() uintptr +func addrOfStackSegmentFault() uintptr +func addrOfGeneralProtectionFault() uintptr +func addrOfPageFault() uintptr +func addrOfX87FloatingPointException() uintptr +func addrOfAlignmentCheck() uintptr +func addrOfMachineCheck() uintptr +func addrOfSimdFloatingPointException() uintptr +func addrOfVirtualizationException() uintptr +func addrOfSecurityException() uintptr +func addrOfSyscallInt80() uintptr + // Exception handler index. -var handlers = map[Vector]func(){ - DivideByZero: divideByZero, - Debug: debug, - NMI: nmi, - Breakpoint: breakpoint, - Overflow: overflow, - BoundRangeExceeded: boundRangeExceeded, - InvalidOpcode: invalidOpcode, - DeviceNotAvailable: deviceNotAvailable, - DoubleFault: doubleFault, - CoprocessorSegmentOverrun: coprocessorSegmentOverrun, - InvalidTSS: invalidTSS, - SegmentNotPresent: segmentNotPresent, - StackSegmentFault: stackSegmentFault, - GeneralProtectionFault: generalProtectionFault, - PageFault: pageFault, - X87FloatingPointException: x87FloatingPointException, - AlignmentCheck: alignmentCheck, - MachineCheck: machineCheck, - SIMDFloatingPointException: simdFloatingPointException, - VirtualizationException: virtualizationException, - SecurityException: securityException, - SyscallInt80: syscallInt80, +var handlers = map[Vector]uintptr{ + DivideByZero: addrOfDivideByZero(), + Debug: addrOfDebug(), + NMI: addrOfNMI(), + Breakpoint: addrOfBreakpoint(), + Overflow: addrOfOverflow(), + BoundRangeExceeded: addrOfBoundRangeExceeded(), + InvalidOpcode: addrOfInvalidOpcode(), + DeviceNotAvailable: addrOfDeviceNotAvailable(), + DoubleFault: addrOfDoubleFault(), + CoprocessorSegmentOverrun: addrOfCoprocessorSegmentOverrun(), + InvalidTSS: addrOfInvalidTSS(), + SegmentNotPresent: addrOfSegmentNotPresent(), + StackSegmentFault: addrOfStackSegmentFault(), + GeneralProtectionFault: addrOfGeneralProtectionFault(), + PageFault: addrOfPageFault(), + X87FloatingPointException: addrOfX87FloatingPointException(), + AlignmentCheck: addrOfAlignmentCheck(), + MachineCheck: addrOfMachineCheck(), + SIMDFloatingPointException: addrOfSimdFloatingPointException(), + VirtualizationException: addrOfVirtualizationException(), + SecurityException: addrOfSecurityException(), + SyscallInt80: addrOfSyscallInt80(), } diff --git a/pkg/ring0/entry_amd64.s b/pkg/ring0/entry_amd64.s index f59747df3..520bd9f57 100644 --- a/pkg/ring0/entry_amd64.s +++ b/pkg/ring0/entry_amd64.s @@ -88,11 +88,33 @@ #define LOAD_KERNEL_STACK(entry) \ MOVQ ENTRY_STACK_TOP(entry), SP; +// ADDR_OF_FUNC defines a function named 'name' that returns the address of +// 'symbol'. +#define ADDR_OF_FUNC(name, symbol) \ +TEXT name,$0-8; \ + MOVQ $symbol, AX; \ + MOVQ AX, ret+0(FP); \ + RET + // See kernel.go. TEXT ·Halt(SB),NOSPLIT,$0 HLT RET +// See kernel_amd64.go. +TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8 + HLT + + // Restore FS_BASE. + MOVQ regs+0(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ AX + + RET + // See entry_amd64.go. TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() @@ -107,8 +129,29 @@ TEXT ·jumpToKernel(SB),NOSPLIT,$0 MOVQ AX, 0(SP) RET +// jumpToUser changes execution to the user address space. +// +// This works by changing the return value to the user version. +TEXT ·jumpToUser(SB),NOSPLIT,$0 + // N.B. we can't access KernelStartAddress from the upper half (data + // pages not available), so just naively clear all the upper bits. + // We are assuming a 47-bit virtual address space. + MOVQ $0x00007fffffffffff, AX + MOVQ 0(SP), BX + ANDQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + // See entry_amd64.go. TEXT ·sysret(SB),NOSPLIT,$0-24 + // Set application FS. We can't do this in Go because Go code needs FS. + MOVQ regs+8(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX + CALL ·writeFS(SB) + POPQ AX + CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return @@ -142,6 +185,14 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 // See entry_amd64.go. TEXT ·iret(SB),NOSPLIT,$0-24 + // Set application FS. We can't do this in Go because Go code needs FS. + MOVQ regs+8(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ AX + CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return @@ -184,13 +235,29 @@ TEXT ·resume(SB),NOSPLIT,$0 IRET() // See entry_amd64.go. -TEXT ·Start(SB),NOSPLIT,$0 +TEXT ·start(SB),NOSPLIT,$0 + // N.B. This is the vCPU entrypoint. It is not called from Go code and + // thus pushes and pops values on the stack until calling into Go + // (startGo) because we aren't usually a typical Go assembly frame. + PUSHQ $0x0 // Previous frame pointer. MOVQ SP, BP // Set frame pointer. - PUSHQ AX // First argument (CPU). - CALL ·start(SB) // Call Go hook. + + PUSHQ AX // Save CPU. + + // Set up environment required by Go before calling startGo: Go needs + // FS_BASE and floating point initialized. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + + // First argument (CPU) already at bottom of stack. + CALL ·startGo(SB) // Call Go hook. JMP ·resume(SB) // Restore to registers. +ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB)); + // See entry_amd64.go. TEXT ·sysenter(SB),NOSPLIT,$0 // _RFLAGS_IOPL0 is always set in the user mode and it is never set in @@ -218,6 +285,18 @@ user: MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. + CALL ·jumpToUser(SB) + + // Restore kernel FS_BASE. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + // Return to the kernel, where the frame is: // // vector (sp+32) @@ -252,6 +331,8 @@ kernel: POPQ AX // Pop vCPU. JMP ·resume(SB) +ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB)); + // exception is a generic exception handler. // // There are two cases handled: @@ -298,6 +379,16 @@ user: MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) + CALL ·jumpToUser(SB) + + // Restore kernel FS_BASE. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + // Copy out and return. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ 0(SP), BX // Load vector. @@ -336,36 +427,38 @@ kernel: POPQ AX // Pop vCPU. JMP ·resume(SB) -#define EXCEPTION_WITH_ERROR(value, symbol) \ +#define EXCEPTION_WITH_ERROR(value, symbol, addr) \ +ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT,$0; \ PUSHQ $value; \ JMP ·exception(SB); -#define EXCEPTION_WITHOUT_ERROR(value, symbol) \ +#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \ +ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT,$0; \ PUSHQ $0x0; \ PUSHQ $value; \ JMP ·exception(SB); -EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB)) -EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB)) -EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB)) -EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB)) -EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB)) -EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB)) -EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB)) -EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB)) -EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB)) -EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB)) -EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB)) -EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB)) -EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB)) -EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB)) -EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB)) -EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB)) -EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB)) -EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB)) -EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB)) -EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB)) -EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB)) -EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB)) +EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB)) +EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB)) +EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB)) +EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB)) +EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB)) +EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB)) +EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB)) +EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB)) +EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB)) +EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB)) +EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB)) +EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB)) +EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB)) +EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB)) +EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB)) +EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB)) +EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB)) +EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB)) +EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB)) +EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB)) +EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB)) +EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB)) diff --git a/pkg/ring0/entry_arm64.go b/pkg/ring0/entry_arm64.go index 62a93f3d6..299036478 100644 --- a/pkg/ring0/entry_arm64.go +++ b/pkg/ring0/entry_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go index f63af8b76..4a4c0ae26 100644 --- a/pkg/ring0/kernel_amd64.go +++ b/pkg/ring0/kernel_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 @@ -19,12 +20,20 @@ package ring0 import ( "encoding/binary" "reflect" + "sync" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/sentry/arch" ) +// HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the +// value in regs. +func HaltAndWriteFSBase(regs *arch.Registers) + // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { + initSentryXCR0() + entrySize := reflect.TypeOf(kernelEntry{}).Size() var ( entries []kernelEntry @@ -168,7 +177,7 @@ func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { // //go:nosplit func (c *CPU) CR0() uint64 { - return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET + return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET | _CR0_NE } // CR4 returns the CPU's CR4 value. @@ -240,7 +249,6 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { // Perform the switch. swapgs() // GS will be swapped on return. - WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy in floating point. if switchOpts.FullRestore { @@ -249,38 +257,58 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { vector = sysret(c, regs, uintptr(userCR3)) } SaveFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy out floating point. - WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. RestoreKernelFPState() // escapes: no. Restore kernel MXCSR. return } -var sentryXCR0 = xgetbv(0) +var ( + sentryXCR0 uintptr + sentryXCR0Once sync.Once +) -// start is the CPU entrypoint. +// initSentryXCR0 saves a value of XCR0 in the host mode. It is used to +// initialize XCR0 of guest vCPU-s. +func initSentryXCR0() { + sentryXCR0Once.Do(func() { sentryXCR0 = xgetbv(0) }) +} + +// startGo is the CPU entrypoint. // -// This is called from the Start asm stub (see entry_amd64.go); on return the +// This is called from the start asm stub (see entry_amd64.go); on return the // registers in c.registers will be restored (not segments). // +// Note that any code written in Go should adhere to Go expected environment: +// * Initialized floating point state (required for optimizations using +// floating point instructions). +// * Go TLS in FS_BASE (this is required by splittable functions, calls into +// the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access +// TLS)). +// //go:nosplit -func start(c *CPU) { - // Save per-cpu & FS segment. +func startGo(c *CPU) { + // Save per-cpu. WriteGS(kernelAddr(c.kernelEntry)) - WriteFS(uintptr(c.registers.Fs_base)) + // + // TODO(mpratt): Note that per the note above, this should be done + // before entering Go code. However for simplicity we leave it here for + // now, since the small critical sections with undefined FPU state + // should only contain very limited use of floating point instructions + // (notably, use of XMM15 as a zero register). fninit() // Need to sync XCR0 with the host, because xsave and xrstor can be // called from different contexts. xsetbv(0, sentryXCR0) // Set the syscall target. - wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) + wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter())) wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) // NOTE: This depends on having the 64-bit segments immediately // following the 32-bit user segments. This is simply the way the // sysret instruction is designed to work (it assumes they follow). wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) - wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) + wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter())) } // SetCPUIDFaulting sets CPUID faulting per the boolean value. diff --git a/pkg/ring0/kernel_arm64.go b/pkg/ring0/kernel_arm64.go index 21db910a2..79f85ff50 100644 --- a/pkg/ring0/kernel_arm64.go +++ b/pkg/ring0/kernel_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/kernel_unsafe.go b/pkg/ring0/kernel_unsafe.go index 16955ad91..04c60d0a7 100644 --- a/pkg/ring0/kernel_unsafe.go +++ b/pkg/ring0/kernel_unsafe.go @@ -35,7 +35,6 @@ func kernelAddr(obj interface{}) uintptr { // kernelFunc returns the address of the given function. // //go:nosplit -func kernelFunc(fn func()) uintptr { - fnptr := (**uintptr)(unsafe.Pointer(&fn)) - return KernelStartAddress | **fnptr +func kernelFunc(fn uintptr) uintptr { + return KernelStartAddress | fn } diff --git a/pkg/ring0/lib_amd64.go b/pkg/ring0/lib_amd64.go index 3e6bb9663..05c394ff5 100644 --- a/pkg/ring0/lib_amd64.go +++ b/pkg/ring0/lib_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 @@ -43,8 +44,8 @@ func xsave(*byte) // xsaveopt uses xsaveopt to save floating point state. func xsaveopt(*byte) -// WriteFS sets the GS address (set by init). -var WriteFS func(addr uintptr) +// writeFS sets the FS base address (selects one of wrfsbase or wrfsmsr). +func writeFS(addr uintptr) // wrfsbase writes to the GS base address. func wrfsbase(addr uintptr) @@ -116,10 +117,8 @@ func Init(featureSet *cpuid.FeatureSet) { LoadFloatingPoint = fxrstor } if hasFSGSBASE { - WriteFS = wrfsbase WriteGS = wrgsbase } else { - WriteFS = wrfsmsr WriteGS = wrgsmsr } } diff --git a/pkg/ring0/lib_amd64.s b/pkg/ring0/lib_amd64.s index 70a43e79e..8ed98fc84 100644 --- a/pkg/ring0/lib_amd64.s +++ b/pkg/ring0/lib_amd64.s @@ -80,6 +80,29 @@ TEXT ·xsaveopt(SB),NOSPLIT,$0-8 BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; RET +// writeFS writes to the FS base. +// +// This is written in assembly because it must be safe to call before the Go +// environment is set up. See comment on start(). +// +// Preconditions: must be running in the lower address space, as it accesses +// global data. +TEXT ·writeFS(SB),NOSPLIT,$8-8 + MOVQ addr+0(FP), AX + + CMPB ·hasFSGSBASE(SB), $1 + JNE msr + + PUSHQ AX + CALL ·wrfsbase(SB) + POPQ AX + RET +msr: + PUSHQ AX + CALL ·wrfsmsr(SB) + POPQ AX + RET + // wrfsbase writes to the FS base. // // The code corresponds to: diff --git a/pkg/ring0/lib_arm64.go b/pkg/ring0/lib_arm64.go index 5eabd4296..a72a6926d 100644 --- a/pkg/ring0/lib_arm64.go +++ b/pkg/ring0/lib_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/offsets_amd64.go b/pkg/ring0/offsets_amd64.go index ca4075b09..75f6218b3 100644 --- a/pkg/ring0/offsets_amd64.go +++ b/pkg/ring0/offsets_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ring0 @@ -95,6 +96,6 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FS_BASE 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_GS_BASE 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) } diff --git a/pkg/ring0/offsets_arm64.go b/pkg/ring0/offsets_arm64.go index 03adaa6b0..60b2c4074 100644 --- a/pkg/ring0/offsets_arm64.go +++ b/pkg/ring0/offsets_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ring0 diff --git a/pkg/ring0/pagetables/pagetables_aarch64.go b/pkg/ring0/pagetables/pagetables_aarch64.go index 86eb00a4f..aa2a5c984 100644 --- a/pkg/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/ring0/pagetables/pagetables_aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/pagetables/pagetables_amd64_test.go b/pkg/ring0/pagetables/pagetables_amd64_test.go index a13c616ae..c27b3b10a 100644 --- a/pkg/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/ring0/pagetables/pagetables_amd64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package pagetables diff --git a/pkg/ring0/pagetables/pagetables_arm64_test.go b/pkg/ring0/pagetables/pagetables_arm64_test.go index 2514b9ac5..1c919ec7d 100644 --- a/pkg/ring0/pagetables/pagetables_arm64_test.go +++ b/pkg/ring0/pagetables/pagetables_arm64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/pagetables/pagetables_x86.go b/pkg/ring0/pagetables/pagetables_x86.go index e43698173..dc98d8452 100644 --- a/pkg/ring0/pagetables/pagetables_x86.go +++ b/pkg/ring0/pagetables/pagetables_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package pagetables diff --git a/pkg/ring0/pagetables/pcids_aarch64.go b/pkg/ring0/pagetables/pcids_aarch64.go index fbfd41d83..ad492d039 100644 --- a/pkg/ring0/pagetables/pcids_aarch64.go +++ b/pkg/ring0/pagetables/pcids_aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/pagetables/pcids_aarch64.s b/pkg/ring0/pagetables/pcids_aarch64.s index e9d62d768..cfcedba71 100644 --- a/pkg/ring0/pagetables/pcids_aarch64.s +++ b/pkg/ring0/pagetables/pcids_aarch64.s @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 #include "funcdata.h" diff --git a/pkg/ring0/pagetables/pcids_x86.go b/pkg/ring0/pagetables/pcids_x86.go index 91fc5e8dd..2a107ea70 100644 --- a/pkg/ring0/pagetables/pcids_x86.go +++ b/pkg/ring0/pagetables/pcids_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build i386 || amd64 // +build i386 amd64 package pagetables diff --git a/pkg/ring0/pagetables/walker_amd64.go b/pkg/ring0/pagetables/walker_amd64.go index eb4fbcc31..ca5e2f85f 100644 --- a/pkg/ring0/pagetables/walker_amd64.go +++ b/pkg/ring0/pagetables/walker_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package pagetables diff --git a/pkg/ring0/pagetables/walker_arm64.go b/pkg/ring0/pagetables/walker_arm64.go index 5ed881c7a..e32dbda2d 100644 --- a/pkg/ring0/pagetables/walker_arm64.go +++ b/pkg/ring0/pagetables/walker_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package pagetables diff --git a/pkg/ring0/x86.go b/pkg/ring0/x86.go index 34fbc1c35..9a98703da 100644 --- a/pkg/ring0/x86.go +++ b/pkg/ring0/x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build 386 || amd64 // +build 386 amd64 package ring0 @@ -24,6 +25,7 @@ import ( const ( _CR0_PE = 1 << 0 _CR0_ET = 1 << 4 + _CR0_NE = 1 << 5 _CR0_AM = 1 << 18 _CR0_PG = 1 << 31 diff --git a/pkg/safecopy/BUILD b/pkg/safecopy/BUILD index b77c40279..0a045fc8e 100644 --- a/pkg/safecopy/BUILD +++ b/pkg/safecopy/BUILD @@ -18,7 +18,9 @@ go_library( ], visibility = ["//:sandbox"], deps = [ - "//pkg/syserror", + "//pkg/abi/linux", + "//pkg/errors", + "//pkg/errors/linuxerr", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/safecopy/safecopy.go b/pkg/safecopy/safecopy.go index df63dd5f1..a9711e63d 100644 --- a/pkg/safecopy/safecopy.go +++ b/pkg/safecopy/safecopy.go @@ -21,7 +21,8 @@ import ( "runtime" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // SegvError is returned when a safecopy function receives SIGSEGV. @@ -82,7 +83,7 @@ var ( // when we get a SIGSEGV that is not interesting to us. savedSigSegVHandler uintptr - // same a above, but for SIGBUS signals. + // Same as above, but for SIGBUS signals. savedSigBusHandler uintptr ) @@ -137,12 +138,12 @@ func init() { if err := ReplaceSignalHandler(unix.SIGBUS, addrOfSignalHandler(), &savedSigBusHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err)) } - syserror.AddErrorUnwrapper(func(e error) (unix.Errno, bool) { + linuxerr.AddErrorUnwrapper(func(e error) (*errors.Error, bool) { switch e.(type) { case SegvError, BusError, AlignmentError: - return unix.EFAULT, true + return linuxerr.EFAULT, true default: - return 0, false + return nil, false } }) } diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go index 3ec73f296..2365b2c0d 100644 --- a/pkg/safecopy/safecopy_unsafe.go +++ b/pkg/safecopy/safecopy_unsafe.go @@ -20,6 +20,7 @@ import ( "unsafe" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" ) // maxRegisterSize is the maximum register size used in memcpy and memclr. It @@ -342,15 +343,7 @@ func errorFromFaultSignal(addr uintptr, sig int32) error { // handler however, and if this is function is being used externally then the // same courtesy is expected. func ReplaceSignalHandler(sig unix.Signal, handler uintptr, previous *uintptr) error { - // TODO(gvisor.dev/issue/6160): This struct is the same as linux.SigAction. - // Once the usermem dependency is removed from primitive, delete this replica - // and remove IFTTT comments in abi/linux/signal.go. - var sa struct { - handler uintptr - flags uint64 - restorer uintptr - mask uint64 - } + var sa linux.SigAction const maskLen = 8 // Get the existing signal handler information, and save the current @@ -361,14 +354,14 @@ func ReplaceSignalHandler(sig unix.Signal, handler uintptr, previous *uintptr) e } // Fail if there isn't a previous handler. - if sa.handler == 0 { + if sa.Handler == 0 { return fmt.Errorf("previous handler for signal %x isn't set", sig) } - *previous = sa.handler + *previous = uintptr(sa.Handler) // Install our own handler. - sa.handler = handler + sa.Handler = uint64(handler) if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { return e } diff --git a/pkg/safemem/io.go b/pkg/safemem/io.go index f039a5c34..9551ca853 100644 --- a/pkg/safemem/io.go +++ b/pkg/safemem/io.go @@ -207,58 +207,6 @@ func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { return wbn, buf, rerr } -// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly -// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial -// read indicates an error. This is not thread-safe. -type FromIOReaderAt struct { - ReaderAt io.ReaderAt - Offset int64 -} - -// ReadToBlocks implements Reader.ReadToBlocks. -func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { - var buf []byte - var done uint64 - for !dsts.IsEmpty() { - dst := dsts.Head() - var n int - var err error - n, buf, err = r.readToBlock(dst, buf) - done += uint64(n) - if n != dst.Len() { - return done, err - } - dsts = dsts.Tail() - if err != nil { - if dsts.IsEmpty() && err == io.EOF { - return done, nil - } - return done, err - } - } - return done, nil -} - -func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { - // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require - // safecopy. - if !dst.NeedSafecopy() { - n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) - r.Offset += int64(n) - return n, buf, err - } - if len(buf) < dst.Len() { - buf = make([]byte, dst.Len()) - } - rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) - r.Offset += int64(rn) - wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) - if wberr != nil { - return wbn, buf, wberr - } - return wbn, buf, rerr -} - // FromIOWriter implements Writer for an io.Writer by repeatedly invoking // io.Writer.Write until it returns an error or partial write. // diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 8ffa1db37..062250d69 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -74,8 +74,8 @@ func Install(rules SyscallRules) error { } // Perform the actual installation. - if errno := SetFilter(instrs); errno != 0 { - return fmt.Errorf("failed to set filter: %v", errno) + if err := SetFilter(instrs); err != nil { + return fmt.Errorf("failed to set filter: %v", err) } log.Infof("Seccomp filters installed.") diff --git a/pkg/seccomp/seccomp_amd64.go b/pkg/seccomp/seccomp_amd64.go index 00bf332c1..9cd003bc5 100644 --- a/pkg/seccomp/seccomp_amd64.go +++ b/pkg/seccomp/seccomp_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package seccomp diff --git a/pkg/seccomp/seccomp_arm64.go b/pkg/seccomp/seccomp_arm64.go index b62133f21..adcf73e72 100644 --- a/pkg/seccomp/seccomp_arm64.go +++ b/pkg/seccomp/seccomp_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package seccomp diff --git a/pkg/seccomp/seccomp_test_victim_amd64.go b/pkg/seccomp/seccomp_test_victim_amd64.go index efb8604ec..5c1ecc301 100644 --- a/pkg/seccomp/seccomp_test_victim_amd64.go +++ b/pkg/seccomp/seccomp_test_victim_amd64.go @@ -15,6 +15,7 @@ // Test binary used to test that seccomp filters are properly constructed and // indeed kill the process on violation. +//go:build amd64 // +build amd64 package main diff --git a/pkg/seccomp/seccomp_test_victim_arm64.go b/pkg/seccomp/seccomp_test_victim_arm64.go index 97cb5f5fe..9647e2758 100644 --- a/pkg/seccomp/seccomp_test_victim_arm64.go +++ b/pkg/seccomp/seccomp_test_victim_arm64.go @@ -15,6 +15,7 @@ // Test binary used to test that seccomp filters are properly constructed and // indeed kill the process on violation. +//go:build arm64 // +build arm64 package main diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index 7202591df..6701b5542 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -15,6 +15,8 @@ package seccomp import ( + "fmt" + "runtime" "unsafe" "golang.org/x/sys/unix" @@ -22,12 +24,56 @@ import ( ) // SetFilter installs the given BPF program. +func SetFilter(instrs []linux.BPFInstruction) error { + // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See + // seccomp(2) for details. + // + // PR_SET_NO_NEW_PRIVS is specific to the calling thread, not the whole + // thread group, so between PR_SET_NO_NEW_PRIVS and seccomp() below we must + // remain on the same thread. no_new_privs will be propagated to other + // threads in the thread group by seccomp(SECCOMP_FILTER_FLAG_TSYNC), in + // kernel/seccomp.c:seccomp_sync_threads(). + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if _, _, errno := unix.RawSyscall6(unix.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0); errno != 0 { + return errno + } + + sockProg := linux.SockFprog{ + Len: uint16(len(instrs)), + Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), + } + tid, errno := seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) + if errno != 0 { + return errno + } + // "On error, if SECCOMP_FILTER_FLAG_TSYNC was used, the return value is + // the ID of the thread that caused the synchronization failure. (This ID + // is a kernel thread ID of the type returned by clone(2) and gettid(2).)" + // - seccomp(2) + if tid != 0 { + return fmt.Errorf("couldn't synchronize filter to TID %d", tid) + } + return nil +} + +// SetFilterInChild is equivalent to SetFilter, but: +// +// - It is safe to call after runtime.syscall_runtime_AfterForkInChild. +// +// - It requires that the calling goroutine cannot be moved to another thread, +// which either requires that runtime.LockOSThread() is in effect or that the +// caller is in fact in a fork()ed child process. // -// This is safe to call from an afterFork context. +// - Since fork()ed child processes cannot perform heap allocation, it returns +// a unix.Errno rather than an error. // +// - The race instrumentation has to be disabled for all functions that are +// called in a forked child. +// +//go:norace //go:nosplit -func SetFilter(instrs []linux.BPFInstruction) unix.Errno { - // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details. +func SetFilterInChild(instrs []linux.BPFInstruction) unix.Errno { if _, _, errno := unix.RawSyscall6(unix.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0); errno != 0 { return errno } @@ -36,12 +82,22 @@ func SetFilter(instrs []linux.BPFInstruction) unix.Errno { Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } - return seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) + tid, errno := seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) + if errno != 0 { + return errno + } + if tid != 0 { + // Return an errno that seccomp(2) doesn't to uniquely identify this + // case. Since this case occurs if another thread has a conflicting + // filter set, "name not unique on network" is at least suggestive? + return unix.ENOTUNIQ + } + return 0 } func isKillProcessAvailable() (bool, error) { action := uint32(linux.SECCOMP_RET_KILL_PROCESS) - if errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { + if _, errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { // EINVAL: SECCOMP_GET_ACTION_AVAIL not in this kernel yet. // EOPNOTSUPP: SECCOMP_RET_KILL_PROCESS not supported. if errno == unix.EINVAL || errno == unix.EOPNOTSUPP { @@ -55,9 +111,7 @@ func isKillProcessAvailable() (bool, error) { // seccomp calls seccomp(2). This is safe to call from an afterFork context. // //go:nosplit -func seccomp(op, flags uint32, ptr unsafe.Pointer) unix.Errno { - if _, _, errno := unix.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)); errno != 0 { - return errno - } - return 0 +func seccomp(op, flags uint32, ptr unsafe.Pointer) (uintptr, unix.Errno) { + n, _, errno := unix.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)) + return n, errno } diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 61dacd2fb..e0dbc436d 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -28,13 +28,13 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/cpuid", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", "//pkg/sentry/arch/fpu", "//pkg/sentry/limits", - "//pkg/syserror", "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 08789f517..9a827e84f 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch @@ -22,10 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/syserror" ) // Registers represents the CPU registers for this architecture. @@ -233,11 +234,11 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceGetRegs(dst) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } @@ -246,11 +247,11 @@ func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceSetRegs(src) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index d6b4d2357..e7cb24102 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 348f238fd..0d27a1f22 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index b2b94c304..6da13f26e 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || 386 // +build amd64 386 package arch diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index e8e52d3a8..96e9a6949 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || 386 // +build amd64 386 package arch @@ -23,10 +24,10 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/syserror" ) // Registers represents the CPU registers for this architecture. @@ -353,7 +354,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceGetRegs(dst) case _NT_PRFPREG: @@ -361,7 +362,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, case _NT_X86_XSTATE: return s.fpState.PtraceGetXstateRegs(dst, maxlen, s.FeatureSet) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } @@ -370,7 +371,7 @@ func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return s.PtraceSetRegs(src) case _NT_PRFPREG: @@ -378,7 +379,7 @@ func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, case _NT_X86_XSTATE: return s.fpState.PtraceSetXstateRegs(src, maxlen, s.FeatureSet) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go index 5d7b99bd9..bb5ff7f7f 100644 --- a/pkg/sentry/arch/arch_x86_impl.go +++ b/pkg/sentry/arch/arch_x86_impl.go @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build (amd64 || 386) && go1.1 // +build amd64 386 +// +build go1.1 package arch diff --git a/pkg/sentry/arch/fpu/BUILD b/pkg/sentry/arch/fpu/BUILD index 4e4f20639..1f371e513 100644 --- a/pkg/sentry/arch/fpu/BUILD +++ b/pkg/sentry/arch/fpu/BUILD @@ -9,13 +9,14 @@ go_library( "fpu_amd64.go", "fpu_amd64.s", "fpu_arm64.go", + "fpu_unsafe.go", ], visibility = ["//:sandbox"], deps = [ "//pkg/cpuid", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sync", - "//pkg/syserror", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/arch/fpu/fpu.go b/pkg/sentry/arch/fpu/fpu.go index 867d309a3..62bde19d3 100644 --- a/pkg/sentry/arch/fpu/fpu.go +++ b/pkg/sentry/arch/fpu/fpu.go @@ -17,7 +17,6 @@ package fpu import ( "fmt" - "reflect" ) // State represents floating point state. @@ -40,15 +39,3 @@ type ErrLoadingState struct { func (e ErrLoadingState) Error() string { return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supportedFeatures, e.savedFeatures) } - -// alignedBytes returns a slice of size bytes, aligned in memory to the given -// alignment. This is used because we require certain structures to be aligned -// in a specific way (for example, the X86 floating point data). -func alignedBytes(size, alignment uint) []byte { - data := make([]byte, size+alignment-1) - offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment)) - if offset == 0 { - return data[:size:size] - } - return data[alignment-offset:][:size:size] -} diff --git a/pkg/sentry/arch/fpu/fpu_amd64.go b/pkg/sentry/arch/fpu/fpu_amd64.go index f0ba26736..e422f67a1 100644 --- a/pkg/sentry/arch/fpu/fpu_amd64.go +++ b/pkg/sentry/arch/fpu/fpu_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 || i386 // +build amd64 i386 package fpu @@ -21,9 +22,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // initX86FPState (defined in asm files) sets up initial state. @@ -70,7 +71,7 @@ const ptraceFPRegsSize = 512 // PtraceGetFPRegs implements Context.PtraceGetFPRegs. func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return dst.Write((*s)[:ptraceFPRegsSize]) @@ -79,7 +80,7 @@ func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) { // PtraceSetFPRegs implements Context.PtraceSetFPRegs. func (s *State) PtraceSetFPRegs(src io.Reader, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } var f [ptraceFPRegsSize]byte diff --git a/pkg/sentry/arch/fpu/fpu_arm64.go b/pkg/sentry/arch/fpu/fpu_arm64.go index 46634661f..49e641722 100644 --- a/pkg/sentry/arch/fpu/fpu_arm64.go +++ b/pkg/sentry/arch/fpu/fpu_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package fpu diff --git a/pkg/sentry/arch/fpu/fpu_unsafe.go b/pkg/sentry/arch/fpu/fpu_unsafe.go new file mode 100644 index 000000000..c91dc99be --- /dev/null +++ b/pkg/sentry/arch/fpu/fpu_unsafe.go @@ -0,0 +1,31 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fpu + +import ( + "unsafe" +) + +// alignedBytes returns a slice of size bytes, aligned in memory to the given +// alignment. This is used because we require certain structures to be aligned +// in a specific way (for example, the X86 floating point data). +func alignedBytes(size, alignment uint) []byte { + data := make([]byte, size+alignment-1) + offset := uint(uintptr(unsafe.Pointer(&data[0])) % uintptr(alignment)) + if offset == 0 { + return data[:size:size] + } + return data[alignment-offset:][:size:size] +} diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 58e28dbba..dbd4336f9 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 80df90076..ee22ec512 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 3859f41ee..c021ba072 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package arch diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go index 95dfd1e90..7146c9e44 100644 --- a/pkg/sentry/arch/syscalls_arm64.go +++ b/pkg/sentry/arch/syscalls_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package arch diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index deaf5fa23..cfb33a398 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,21 +1,33 @@ -load("//tools:defs.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") package(licenses = ["notice"]) +proto_library( + name = "control", + srcs = ["control.proto"], + visibility = ["//visibility:public"], +) + go_library( name = "control", srcs = [ "control.go", + "events.go", + "fs.go", + "lifecycle.go", "logging.go", "pprof.go", "proc.go", "state.go", + "usage.go", ], visibility = [ "//:sandbox", ], deps = [ "//pkg/abi/linux", + "//pkg/context", + "//pkg/eventchannel", "//pkg/fd", "//pkg/log", "//pkg/sentry/fdimport", @@ -35,6 +47,8 @@ go_library( "//pkg/sync", "//pkg/tcpip/link/sniffer", "//pkg/urpc", + "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/control/control.proto b/pkg/sentry/control/control.proto new file mode 100644 index 000000000..72dda3fbc --- /dev/null +++ b/pkg/sentry/control/control.proto @@ -0,0 +1,40 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +// ControlConfig configures the permission of controls. +message ControlConfig { + // Names for individual control URPC service objects. + // Any new service object that should be given conditional access should be + // named here and conditionally added based on presence in allowed_controls. + enum Endpoint { + UNKNOWN = 0; + EVENTS = 1; + FS = 2; + LIFECYCLE = 3; + LOGGING = 4; + PROFILE = 5; + USAGE = 6; + PROC = 7; + STATE = 8; + DEBUG = 9; + } + + // allowed_controls represents which endpoints may be registered to the + // server. + repeated Endpoint allowed_controls = 1; +} diff --git a/pkg/sentry/control/events.go b/pkg/sentry/control/events.go new file mode 100644 index 000000000..92e437ae7 --- /dev/null +++ b/pkg/sentry/control/events.go @@ -0,0 +1,65 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "errors" + "fmt" + + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/urpc" +) + +// EventsOpts are the arguments for eventchannel-related commands. +type EventsOpts struct { + urpc.FilePayload +} + +// Events is the control server state for eventchannel-related commands. +type Events struct { + emitter eventchannel.Emitter +} + +// AttachDebugEmitter receives a connected unix domain socket FD from the client +// and establishes it as a new emitter for the sentry eventchannel. Any existing +// emitters are replaced on a subsequent attach. +func (e *Events) AttachDebugEmitter(o *EventsOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errors.New("no output writer provided") + } + + sock, err := o.ReleaseFD(0) + if err != nil { + return err + } + sockFD := sock.Release() + + // SocketEmitter takes ownership of sockFD. + emitter, err := eventchannel.SocketEmitter(sockFD) + if err != nil { + return fmt.Errorf("failed to create SocketEmitter for FD %d: %v", sockFD, err) + } + + // If there is already a debug emitter, close the old one. + if e.emitter != nil { + e.emitter.Close() + } + + e.emitter = eventchannel.DebugEmitterFrom(emitter) + + // Register the new stream destination. + eventchannel.AddEmitter(e.emitter) + return nil +} diff --git a/pkg/sentry/control/fs.go b/pkg/sentry/control/fs.go new file mode 100644 index 000000000..d19b21f2d --- /dev/null +++ b/pkg/sentry/control/fs.go @@ -0,0 +1,93 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "fmt" + "io" + "os" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/pkg/usermem" +) + +// CatOpts contains options for the Cat RPC call. +type CatOpts struct { + // Files are the filesystem paths for the files to cat. + Files []string `json:"files"` + + // FilePayload contains the destination for output. + urpc.FilePayload +} + +// Fs includes fs-related functions. +type Fs struct { + Kernel *kernel.Kernel +} + +// Cat is a RPC stub which prints out and returns the content of the files. +func (f *Fs) Cat(o *CatOpts, _ *struct{}) error { + // Create an output stream. + if len(o.FilePayload.Files) != 1 { + return ErrInvalidFiles + } + + output := o.FilePayload.Files[0] + for _, file := range o.Files { + if err := cat(f.Kernel, file, output); err != nil { + return fmt.Errorf("cannot read from file %s: %v", file, err) + } + } + + return nil +} + +// fileReader encapsulates a fs.File and provides an io.Reader interface. +type fileReader struct { + ctx context.Context + file *fs.File +} + +// Read implements io.Reader.Read. +func (f *fileReader) Read(p []byte) (int, error) { + n, err := f.file.Readv(f.ctx, usermem.BytesIOSequence(p)) + return int(n), err +} + +func cat(k *kernel.Kernel, path string, output *os.File) error { + ctx := k.SupervisorContext() + mns := k.GlobalInit().Leader().MountNamespace() + root := mns.Root() + defer root.DecRef(ctx) + + remainingTraversals := uint(fs.DefaultTraversalLimit) + d, err := mns.FindInode(ctx, root, nil, path, &remainingTraversals) + if err != nil { + return fmt.Errorf("cannot find file %s: %v", path, err) + } + defer d.DecRef(ctx) + + file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return fmt.Errorf("cannot get file for path %s: %v", path, err) + } + defer file.DecRef(ctx) + + _, err = io.Copy(output, &fileReader{ctx: ctx, file: file}) + return err +} diff --git a/pkg/sentry/control/lifecycle.go b/pkg/sentry/control/lifecycle.go new file mode 100644 index 000000000..67abf497d --- /dev/null +++ b/pkg/sentry/control/lifecycle.go @@ -0,0 +1,36 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" +) + +// Lifecycle provides functions related to starting and stopping tasks. +type Lifecycle struct { + Kernel *kernel.Kernel +} + +// Pause pauses all tasks, blocking until they are stopped. +func (l *Lifecycle) Pause(_, _ *struct{}) error { + l.Kernel.Pause() + return nil +} + +// Resume resumes all tasks. +func (l *Lifecycle) Resume(_, _ *struct{}) error { + l.Kernel.Unpause() + return nil +} diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go index 8a500a515..7613dfcbc 100644 --- a/pkg/sentry/control/logging.go +++ b/pkg/sentry/control/logging.go @@ -50,20 +50,20 @@ type LoggingArgs struct { // enable strace at all. If this flag is false then a completely // pristine copy of the syscall table will be swapped in. This // approach is used to remain consistent with an empty strace - // whitelist meaning trace all system calls. + // allowlist meaning trace all system calls. EnableStrace bool - // Strace is the whitelist of syscalls to trace to log. If this - // and StraceEventWhitelist are empty trace all system calls. - StraceWhitelist []string + // Strace is the allowlist of syscalls to trace to log. If this + // and StraceEventAllowlist are empty trace all system calls. + StraceAllowlist []string // SetEventStrace is a flag used to indicate that event strace // related arguments were passed in. SetEventStrace bool - // StraceEventWhitelist is the whitelist of syscalls to trace + // StraceEventAllowlist is the allowlist of syscalls to trace // to event log. - StraceEventWhitelist []string + StraceEventAllowlist []string } // Logging provides functions related to logging. @@ -107,13 +107,13 @@ func (l *Logging) Change(args *LoggingArgs, code *int) error { func (l *Logging) configureStrace(args *LoggingArgs) error { if args.EnableStrace { - // Install the whitelist specified. - if len(args.StraceWhitelist) > 0 { - if err := strace.Enable(args.StraceWhitelist, strace.SinkTypeLog); err != nil { + // Install the allowlist specified. + if len(args.StraceAllowlist) > 0 { + if err := strace.Enable(args.StraceAllowlist, strace.SinkTypeLog); err != nil { return err } } else { - // For convenience, if strace is enabled but whitelist + // For convenience, if strace is enabled but allowlist // is empty, enable everything to log. strace.EnableAll(strace.SinkTypeLog) } @@ -125,8 +125,8 @@ func (l *Logging) configureStrace(args *LoggingArgs) error { } func (l *Logging) configureEventStrace(args *LoggingArgs) error { - if len(args.StraceEventWhitelist) > 0 { - if err := strace.Enable(args.StraceEventWhitelist, strace.SinkTypeEvent); err != nil { + if len(args.StraceEventAllowlist) > 0 { + if err := strace.Enable(args.StraceEventAllowlist, strace.SinkTypeEvent); err != nil { return err } } else { diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 2f3664c57..f721b7236 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -26,6 +26,23 @@ import ( "gvisor.dev/gvisor/pkg/urpc" ) +const ( + // DefaultBlockProfileRate is the default profiling rate for block + // profiles. + // + // The default here is 10%, which will record a stacktrace 10% of the + // time when blocking occurs. Since these events should not be super + // frequent, we expect this to achieve a reasonable balance between + // collecting the data we need and imposing a high performance cost + // (e.g. skewing even the CPU profile). + DefaultBlockProfileRate = 10 + + // DefaultMutexProfileRate is the default profiling rate for mutex + // profiles. Like the block rate above, we use a default rate of 10% + // for the same reasons. + DefaultMutexProfileRate = 10 +) + // Profile includes profile-related RPC stubs. It provides a way to // control the built-in runtime profiling facilities. // @@ -175,12 +192,8 @@ func (p *Profile) Block(o *BlockProfileOpts, _ *struct{}) error { defer p.blockMu.Unlock() // Always set the rate. We then wait to collect a profile at this rate, - // and disable when we're done. Note that the default here is 10%, which - // will record a stacktrace 10% of the time when blocking occurs. Since - // these events should not be super frequent, we expect this to achieve - // a reasonable balance between collecting the data we need and imposing - // a high performance cost (e.g. skewing even the CPU profile). - rate := 10 + // and disable when we're done. + rate := DefaultBlockProfileRate if o.Rate != 0 { rate = o.Rate } @@ -220,9 +233,8 @@ func (p *Profile) Mutex(o *MutexProfileOpts, _ *struct{}) error { p.mutexMu.Lock() defer p.mutexMu.Unlock() - // Always set the fraction. Like the block rate above, we use - // a default rate of 10% for the same reasons. - fraction := 10 + // Always set the fraction. + fraction := DefaultMutexProfileRate if o.Fraction != 0 { fraction = o.Fraction } diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 221e98a01..6352ea71a 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -126,7 +126,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // Wait for completion. newTG.WaitExited() - *waitStatus = newTG.ExitStatus().Status() + *waitStatus = uint32(newTG.ExitStatus()) return nil } @@ -223,7 +223,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI _ = fd.Close() } }() - ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, args.KUID, args.KGID, fds) if err != nil { return nil, 0, nil, nil, err } diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index 62eaca965..4c83b8e8e 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -17,6 +17,7 @@ package control import ( "errors" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/state" @@ -67,7 +68,7 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error { log.Warningf("Save failed: exiting...") s.Kernel.SetSaveError(err) } - s.Kernel.Kill(kernel.ExitStatus{}) + s.Kernel.Kill(linux.WaitStatusExit(0)) }, } return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog) diff --git a/pkg/sentry/control/usage.go b/pkg/sentry/control/usage.go new file mode 100644 index 000000000..cc78d3f45 --- /dev/null +++ b/pkg/sentry/control/usage.go @@ -0,0 +1,183 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "fmt" + "os" + "runtime" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/urpc" +) + +// Usage includes usage-related RPC stubs. +type Usage struct { + Kernel *kernel.Kernel +} + +// MemoryUsageOpts contains usage options. +type MemoryUsageOpts struct { + // Full indicates that a full accounting should be done. If Full is not + // specified, then a partial accounting will be done, and Unknown will + // contain a majority of memory. See Collect for more information. + Full bool `json:"Full"` +} + +// MemoryUsage is a memory usage structure. +type MemoryUsage struct { + Unknown uint64 `json:"Unknown"` + System uint64 `json:"System"` + Anonymous uint64 `json:"Anonymous"` + PageCache uint64 `json:"PageCache"` + Mapped uint64 `json:"Mapped"` + Tmpfs uint64 `json:"Tmpfs"` + Ramdiskfs uint64 `json:"Ramdiskfs"` + Total uint64 `json:"Total"` +} + +// MemoryUsageFileOpts contains usage file options. +type MemoryUsageFileOpts struct { + // Version is used to ensure both sides agree on the format of the + // shared memory buffer. + Version uint64 `json:"Version"` +} + +// MemoryUsageFile contains the file handle to the usage file. +type MemoryUsageFile struct { + urpc.FilePayload +} + +// UsageFD returns the file that tracks the memory usage of the application. +func (u *Usage) UsageFD(opts *MemoryUsageFileOpts, out *MemoryUsageFile) error { + // Only support version 1 for now. + if opts.Version != 1 { + return fmt.Errorf("unsupported version requested: %d", opts.Version) + } + + mf := u.Kernel.MemoryFile() + *out = MemoryUsageFile{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{ + usage.MemoryAccounting.File, + mf.File(), + }, + }, + } + + return nil +} + +// Collect returns memory used by the sandboxed application. +func (u *Usage) Collect(opts *MemoryUsageOpts, out *MemoryUsage) error { + if opts.Full { + // Ensure everything is up to date. + if err := u.Kernel.MemoryFile().UpdateUsage(); err != nil { + return err + } + + // Copy out a snapshot. + snapshot, total := usage.MemoryAccounting.Copy() + *out = MemoryUsage{ + System: snapshot.System, + Anonymous: snapshot.Anonymous, + PageCache: snapshot.PageCache, + Mapped: snapshot.Mapped, + Tmpfs: snapshot.Tmpfs, + Ramdiskfs: snapshot.Ramdiskfs, + Total: total, + } + } else { + // Get total usage from the MemoryFile implementation. + total, err := u.Kernel.MemoryFile().TotalUsage() + if err != nil { + return err + } + + // The memory accounting is guaranteed to be accurate only when + // UpdateUsage is called. If UpdateUsage is not called, then only Mapped + // will be up-to-date. + snapshot, _ := usage.MemoryAccounting.Copy() + *out = MemoryUsage{ + Unknown: total, + Mapped: snapshot.Mapped, + Total: total + snapshot.Mapped, + } + + } + + return nil +} + +// UsageReduceOpts contains options to Usage.Reduce(). +type UsageReduceOpts struct { + // If Wait is true, Reduce blocks until all activity initiated by + // Usage.Reduce() has completed. + Wait bool `json:"wait"` +} + +// UsageReduceOutput contains output from Usage.Reduce(). +type UsageReduceOutput struct{} + +// Reduce requests that the sentry attempt to reduce its memory usage. +func (u *Usage) Reduce(opts *UsageReduceOpts, out *UsageReduceOutput) error { + mf := u.Kernel.MemoryFile() + mf.StartEvictions() + if opts.Wait { + mf.WaitForEvictions() + } + return nil +} + +// MemoryUsageRecord contains the mapping and platform memory file. +type MemoryUsageRecord struct { + mmap uintptr + stats *usage.RTMemoryStats + mf os.File +} + +// NewMemoryUsageRecord creates a new MemoryUsageRecord from usageFile and +// platformFile. +func NewMemoryUsageRecord(usageFile, platformFile os.File) (*MemoryUsageRecord, error) { + mmap, _, e := unix.RawSyscall6(unix.SYS_MMAP, 0, usage.RTMemoryStatsSize, unix.PROT_READ, unix.MAP_SHARED, usageFile.Fd(), 0) + if e != 0 { + return nil, fmt.Errorf("mmap returned %d, want 0", e) + } + + m := MemoryUsageRecord{ + mmap: mmap, + stats: usage.RTMemoryStatsPointer(mmap), + mf: platformFile, + } + + runtime.SetFinalizer(&m, finalizer) + return &m, nil +} + +func finalizer(m *MemoryUsageRecord) { + unix.RawSyscall(unix.SYS_MUNMAP, m.mmap, usage.RTMemoryStatsSize, 0) +} + +// Fetch fetches the usage info from a MemoryUsageRecord. +func (m *MemoryUsageRecord) Fetch() (mapped, unknown, total uint64, err error) { + var stat unix.Stat_t + if err := unix.Fstat(int(m.mf.Fd()), &stat); err != nil { + return 0, 0, 0, err + } + fmem := uint64(stat.Blocks) * 512 + return m.stats.RTMapped, fmem, m.stats.RTMapped + fmem, nil +} diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD index 4c8604d58..66b9ed523 100644 --- a/pkg/sentry/devices/memdev/BUILD +++ b/pkg/sentry/devices/memdev/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/rand", "//pkg/safemem", "//pkg/sentry/fsimpl/devtmpfs", @@ -23,7 +24,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index fece3e762..fc702c9f6 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -16,8 +16,8 @@ package memdev import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -66,12 +66,12 @@ func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.Rea // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ENOSPC + return 0, linuxerr.ENOSPC } // Write implements vfs.FileDescriptionImpl.Write. func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ENOSPC + return 0, linuxerr.ENOSPC } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/devices/quotedev/BUILD b/pkg/sentry/devices/quotedev/BUILD deleted file mode 100644 index d09214e3e..000000000 --- a/pkg/sentry/devices/quotedev/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -licenses(["notice"]) - -go_library( - name = "quotedev", - srcs = ["quotedev.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/sentry/fsimpl/devtmpfs", - "//pkg/sentry/vfs", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/devices/quotedev/quotedev.go b/pkg/sentry/devices/quotedev/quotedev.go deleted file mode 100644 index 6114cb724..000000000 --- a/pkg/sentry/devices/quotedev/quotedev.go +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2021 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package quotedev implements a vfs.Device for /dev/gvisor_quote. -package quotedev - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -const ( - quoteDevMinor = 0 -) - -// quoteDevice implements vfs.Device for /dev/gvisor_quote -// -// +stateify savable -type quoteDevice struct{} - -// Open implements vfs.Device.Open. -// TODO(b/157161182): Add support for attestation ioctls. -func (quoteDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.EIO -} - -// Register registers all devices implemented by this package in vfsObj. -func Register(vfsObj *vfs.VirtualFilesystem) error { - return vfsObj.RegisterDevice(vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, quoteDevice{}, &vfs.RegisterDeviceOptions{ - GroupName: "gvisor_quote", - }) -} - -// CreateDevtmpfsFiles creates device special files in dev representing all -// devices implemented by this package. -func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { - return dev.CreateDeviceFile(ctx, "gvisor_quote", vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, 0666 /* mode */) -} diff --git a/pkg/sentry/devices/ttydev/BUILD b/pkg/sentry/devices/ttydev/BUILD index b4b6ca38a..ab4cd0b33 100644 --- a/pkg/sentry/devices/ttydev/BUILD +++ b/pkg/sentry/devices/ttydev/BUILD @@ -9,8 +9,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go index a287c65ca..29b79b5d6 100644 --- a/pkg/sentry/devices/ttydev/ttydev.go +++ b/pkg/sentry/devices/ttydev/ttydev.go @@ -18,9 +18,9 @@ package ttydev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -36,7 +36,7 @@ type ttyDevice struct{} // Open implements vfs.Device.Open. func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.EIO + return nil, linuxerr.EIO } // Register registers all devices implemented by this package in vfsObj. diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD index 8b38d574d..60c971030 100644 --- a/pkg/sentry/devices/tundev/BUILD +++ b/pkg/sentry/devices/tundev/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/arch", "//pkg/sentry/fsimpl/devtmpfs", @@ -16,7 +17,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/socket/netstack", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/tcpip/link/tun", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go index a12eeb8e7..b4e2a6d91 100644 --- a/pkg/sentry/devices/tundev/tundev.go +++ b/pkg/sentry/devices/tundev/tundev.go @@ -18,6 +18,7 @@ package tundev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -77,11 +77,11 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg switch request { case linux.TUNSETIFF: if !t.HasCapability(linux.CAP_NET_ADMIN) { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } stack, ok := t.NetworkContext().(*netstack.Stack) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var req linux.IFReq @@ -104,7 +104,7 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD index 6b4f8b0ed..563e96e0d 100644 --- a/pkg/sentry/fdimport/BUILD +++ b/pkg/sentry/fdimport/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/sentry/fs/host", "//pkg/sentry/fsimpl/host", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", ], ) diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index badd5b073..f2b9630eb 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -31,9 +32,9 @@ import ( // sets up TTY for the first 3 FDs in the slice representing stdin, stdout, // stderr. Used FDs are either closed or released. It's safe for the caller to // close any remaining files upon return. -func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, uid auth.KUID, gid auth.KGID, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if kernel.VFS2Enabled { - ttyFile, err := importVFS2(ctx, fdTable, console, fds) + ttyFile, err := importVFS2(ctx, fdTable, console, uid, gid, fds) return nil, ttyFile, err } ttyFile, err := importFS(ctx, fdTable, console, fds) @@ -89,7 +90,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] return ttyFile.FileOperations.(*host.TTYFileOperations), nil } -func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) { +func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, uid auth.KUID, gid auth.KGID, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, fmt.Errorf("cannot find kernel from context") @@ -103,7 +104,13 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */) + appFile, err = hostvfs2.NewFD(ctx, k.HostMount(), hostFD.FD(), &hostvfs2.NewFDOptions{ + Savable: true, + IsTTY: true, + VirtualOwner: true, + UID: uid, + GID: gid, + }) if err != nil { return nil, err } @@ -121,7 +128,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi } } else { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */) + appFile, err = hostvfs2.NewFD(ctx, k.HostMount(), hostFD.FD(), &hostvfs2.NewFDOptions{ + Savable: true, + VirtualOwner: true, + UID: uid, + GID: gid, + }) if err != nil { return nil, err } diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 0dc100f9b..4e573d249 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -48,6 +48,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/p9", @@ -67,7 +68,6 @@ go_library( "//pkg/sentry/usage", "//pkg/state", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -110,12 +110,12 @@ go_test( deps = [ ":fs", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 5aa668873..e48bd4dba 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -20,11 +20,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -161,7 +161,7 @@ func doCopyUp(ctx context.Context, d *Dirent) error { // then try to take copyMu for writing here, we'd deadlock. t := d.Inode.overlay.lower.StableAttr.Type if t != RegularFile && t != Directory && t != Symlink { - return syserror.EINVAL + return linuxerr.EINVAL } // Wait to get exclusive access to the upper Inode. @@ -194,7 +194,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx) if err != nil { log.Warningf("copy up failed to get lower attributes: %v", err) - return syserror.EIO + return linuxerr.EIO } var childUpperInode *Inode @@ -210,7 +210,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { childFile, err := parentUpper.Create(ctx, root, next.name, FileFlags{Read: true, Write: true}, attrs.Perms) if err != nil { log.Warningf("copy up failed to create file: %v", err) - return syserror.EIO + return linuxerr.EIO } defer childFile.DecRef(ctx) childUpperInode = childFile.Dirent.Inode @@ -218,13 +218,13 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { case Directory: if err := parentUpper.CreateDirectory(ctx, root, next.name, attrs.Perms); err != nil { log.Warningf("copy up failed to create directory: %v", err) - return syserror.EIO + return linuxerr.EIO } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { werr := fmt.Errorf("copy up failed to lookup directory: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode @@ -234,17 +234,17 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { link, err := childLower.Readlink(ctx) if err != nil { log.Warningf("copy up failed to read symlink value: %v", err) - return syserror.EIO + return linuxerr.EIO } if err := parentUpper.CreateLink(ctx, root, link, next.name); err != nil { log.Warningf("copy up failed to create symlink: %v", err) - return syserror.EIO + return linuxerr.EIO } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { werr := fmt.Errorf("copy up failed to lookup symlink: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode @@ -258,14 +258,14 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil { werr := fmt.Errorf("copy up failed to copy up attributes: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } // Copy the entire file. if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil { werr := fmt.Errorf("copy up failed to copy up contents: %v", err) cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } lowerMappable := next.Inode.overlay.lower.Mappable() @@ -273,7 +273,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { if lowerMappable != nil && upperMappable == nil { werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence") cleanupUpper(ctx, parentUpper, next.name, werr) - return syserror.EIO + return linuxerr.EIO } // Propagate memory mappings to the upper Inode. @@ -410,7 +410,7 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error return err } lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX) - if err != nil && err != syserror.EOPNOTSUPP { + if err != nil && !linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { return err } diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 23a3a9a2d..7baf26b24 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -18,6 +18,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/rand", "//pkg/safemem", @@ -33,7 +34,6 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/socket/netstack", - "//pkg/syserror", "//pkg/tcpip/link/tun", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index deb9c6ad8..6f0c1fc68 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -17,9 +17,9 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -77,5 +77,5 @@ var _ fs.FileOperations = (*fullFileOperations)(nil) // Write implements FileOperations.Write. func (*fullFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.ENOSPC + return 0, linuxerr.ENOSPC } diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index 77e8d222a..1abf11142 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -17,6 +17,7 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -98,11 +98,11 @@ func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io user switch request { case linux.TUNSETIFF: if !t.HasCapability(linux.CAP_NET_ADMIN) { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } stack, ok := t.NetworkContext().(*netstack.Stack) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var req linux.IFReq @@ -125,7 +125,7 @@ func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io user return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 9d5d40954..d300a32e0 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -22,12 +22,12 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) type globalDirentMap struct { @@ -487,11 +487,11 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be // expensive, if possible release the lock and re-acquire it. if walkMayUnlock { - d.mu.Unlock() + d.mu.Unlock() // +checklocksforce: results in an inconsistent block. } c, err := d.Inode.Lookup(ctx, name) if walkMayUnlock { - d.mu.Lock() + d.mu.Lock() // +checklocksforce: see above. } // No dice. if err != nil { @@ -593,21 +593,27 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { // lockDirectory should be called for any operation that changes this `d`s // children (creating or removing them). -func (d *Dirent) lockDirectory() func() { +// +checklocksacquire:d.dirMu +// +checklocksacquire:d.mu +func (d *Dirent) lockDirectory() { renameMu.RLock() d.dirMu.Lock() d.mu.Lock() - return func() { - d.mu.Unlock() - d.dirMu.Unlock() - renameMu.RUnlock() - } +} + +// unlockDirectory is the reverse of lockDirectory. +// +checklocksrelease:d.dirMu +// +checklocksrelease:d.mu +func (d *Dirent) unlockDirectory() { + d.mu.Unlock() + d.dirMu.Unlock() + renameMu.RUnlock() // +checklocksforce: see lockDirectory. } // Create creates a new regular file in this directory. func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? if d.exists(ctx, root, name) { @@ -669,8 +675,8 @@ func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // genericCreate executes create if name does not exist. Removes a negative Dirent at name if // create succeeds. func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Does something already exist? if d.exists(ctx, root, name) { @@ -857,7 +863,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, // Once we have written entries for "." and "..", future errors from // IterateDir will be hidden. if !IsDir(d.Inode.StableAttr) { - return 0, syserror.ENOTDIR + return 0, linuxerr.ENOTDIR } // This is a special case for lseek(fd, 0, SEEK_END). @@ -956,14 +962,14 @@ func (d *Dirent) isMountPointLocked() bool { func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) { // Did we race with deletion? if atomic.LoadInt32(&d.deleted) != 0 { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // Refuse to mount a symlink. // // See Linux equivalent in fs/namespace.c:do_add_mount. if IsSymlink(inode.StableAttr) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // Dirent that'll replace d. @@ -991,7 +997,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { // Did we race with deletion? if atomic.LoadInt32(&d.deleted) != 0 { - return syserror.ENOENT + return linuxerr.ENOENT } // Remount our former child in its place. @@ -1020,8 +1026,8 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Try to walk to the node. child, err := d.walk(ctx, root, name, false /* may unlock */) @@ -1081,8 +1087,8 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) panic("Dirent.Remove: root must not be nil") } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() // Check for dots. if name == "." { @@ -1258,17 +1264,15 @@ func (d *Dirent) dropExtendedReference() { d.Inode.MountSource.fscache.Remove(d) } -// lockForRename takes locks on oldParent and newParent as required by Rename -// and returns a function that will unlock the locks taken. The returned -// function must be called even if a non-nil error is returned. -func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { +// lockForRename takes locks on oldParent and newParent as required by Rename. +// On return, unlockForRename must always be called, even with an error. +// +checklocksacquire:oldParent.mu +// +checklocksacquire:newParent.mu +func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) error { renameMu.Lock() if oldParent == newParent { oldParent.mu.Lock() - return func() { - oldParent.mu.Unlock() - renameMu.Unlock() - }, nil + return nil // +checklocksforce: only one lock exists. } // Renaming between directories is a bit subtle: @@ -1296,11 +1300,7 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // itself. err = unix.EINVAL } - return func() { - newParent.mu.Unlock() - oldParent.mu.Unlock() - renameMu.Unlock() - }, err + return err } child = p } @@ -1309,17 +1309,27 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // have no relationship; in either case we can do this: newParent.mu.Lock() oldParent.mu.Lock() - return func() { + return nil +} + +// unlockForRename is the opposite of lockForRename. +// +checklocksrelease:oldParent.mu +// +checklocksrelease:newParent.mu +func unlockForRename(oldParent, newParent *Dirent) { + if oldParent == newParent { oldParent.mu.Unlock() - newParent.mu.Unlock() - renameMu.Unlock() - }, nil + renameMu.Unlock() // +checklocksforce: only one lock exists. + return + } + newParent.mu.Unlock() + oldParent.mu.Unlock() + renameMu.Unlock() // +checklocksforce: not tracked. } func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { - return syserror.EPERM + return linuxerr.EPERM } if !uattr.Perms.Sticky { return nil @@ -1332,7 +1342,7 @@ func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { vuattr, err := victim.Inode.UnstableAttr(ctx) if err != nil { - return syserror.EPERM + return linuxerr.EPERM } if vuattr.Owner.UID == creds.EffectiveKUID { return nil @@ -1340,7 +1350,7 @@ func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { if victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) { return nil } - return syserror.EPERM + return linuxerr.EPERM } // MayDelete determines whether `name`, a child of `d`, can be deleted or @@ -1352,8 +1362,8 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error return err } - unlock := d.lockDirectory() - defer unlock() + d.lockDirectory() + defer d.unlockDirectory() victim, err := d.walk(ctx, root, name, true /* may unlock */) if err != nil { @@ -1374,7 +1384,7 @@ func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error { } if victim.IsRoot() { - return syserror.EBUSY + return linuxerr.EBUSY } return nil @@ -1391,8 +1401,8 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } // Acquire global renameMu lock, and mu locks on oldParent/newParent. - unlock, err := lockForRename(oldParent, oldName, newParent, newName) - defer unlock() + err := lockForRename(oldParent, oldName, newParent, newName) + defer unlockForRename(oldParent, newParent) if err != nil { return err } @@ -1439,7 +1449,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // replaced is the dirent that is being overwritten by rename. replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */) if err != nil { - if err != syserror.ENOENT { + if !linuxerr.Equals(linuxerr.ENOENT, err) { return err } diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 2120f2bad..9f1fe5160 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -13,6 +13,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/fdnotifier", "//pkg/log", @@ -21,7 +22,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -38,12 +38,13 @@ go_test( library = ":fdpipe", deps = [ "//pkg/context", + "//pkg/errors", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/fdnotifier", "//pkg/hostarch", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/syserror", "//pkg/usermem", "@com_github_google_uuid//:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 757b7d511..4370cce33 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -141,7 +141,7 @@ func (p *pipeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{secio.FullReader{p.file}}) total := int64(bufN) + n if err != nil && isBlockError(err) { - return total, syserror.ErrWouldBlock + return total, linuxerr.ErrWouldBlock } return total, err } @@ -150,15 +150,15 @@ func (p *pipeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO func (p *pipeOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { n, err := src.CopyInTo(ctx, safemem.FromIOWriter{p.file}) if err != nil && isBlockError(err) { - return n, syserror.ErrWouldBlock + return n, linuxerr.ErrWouldBlock } return n, err } // isBlockError unwraps os errors and checks if they are caused by EAGAIN or -// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +// EWOULDBLOCK. This is so they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { - if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + if linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) { return true } if pe, ok := err.(*os.PathError); ok { diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index adda19168..e91e1b5cb 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -21,9 +21,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) // NonBlockingOpener is a generic host file opener used to retry opening host @@ -40,7 +40,7 @@ func Open(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (fs p := &pipeOpenState{} canceled := false for { - if file, err := p.TryOpen(ctx, opener, flags); err != syserror.ErrWouldBlock { + if file, err := p.TryOpen(ctx, opener, flags); err != linuxerr.ErrWouldBlock { return file, err } @@ -51,7 +51,7 @@ func Open(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (fs if p.hostFile != nil { p.hostFile.Close() } - return nil, syserror.ErrInterrupted + return nil, linuxerr.ErrInterrupted } cancel := ctx.SleepStart() @@ -106,13 +106,13 @@ func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, f } return newPipeOperations(ctx, opener, flags, f, nil) - // Handle opening O_WRONLY blocking: convert ENXIO to syserror.ErrWouldBlock. + // Handle opening O_WRONLY blocking: convert ENXIO to linuxerr.ErrWouldBlock. // See TryOpenWriteOnly for more details. case flags.Write: return p.TryOpenWriteOnly(ctx, opener) default: - // Handle opening O_RDONLY blocking: convert EOF from read to syserror.ErrWouldBlock. + // Handle opening O_RDONLY blocking: convert EOF from read to linuxerr.ErrWouldBlock. // See TryOpenReadOnly for more details. return p.TryOpenReadOnly(ctx, opener) } @@ -120,7 +120,7 @@ func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, f // TryOpenReadOnly tries to open a host pipe read only but only returns a fs.File when // there is a coordinating writer. Call TryOpenReadOnly repeatedly on the same pipeOpenState -// until syserror.ErrWouldBlock is no longer returned. +// until linuxerr.ErrWouldBlock is no longer returned. // // How it works: // @@ -150,7 +150,7 @@ func (p *pipeOpenState) TryOpenReadOnly(ctx context.Context, opener NonBlockingO if n == 0 { // EOF means that we're not ready yet. if rerr == nil || rerr == io.EOF { - return nil, syserror.ErrWouldBlock + return nil, linuxerr.ErrWouldBlock } // Any error that is not EWOULDBLOCK also means we're not // ready yet, and probably never will be ready. In this @@ -175,16 +175,16 @@ func (p *pipeOpenState) TryOpenReadOnly(ctx context.Context, opener NonBlockingO // TryOpenWriteOnly tries to open a host pipe write only but only returns a fs.File when // there is a coordinating reader. Call TryOpenWriteOnly repeatedly on the same pipeOpenState -// until syserror.ErrWouldBlock is no longer returned. +// until linuxerr.ErrWouldBlock is no longer returned. // // How it works: // // Opening a pipe write only will return ENXIO until readers are available. Converts the ENXIO -// to an syserror.ErrWouldBlock, to tell callers to retry. +// to an linuxerr.ErrWouldBlock, to tell callers to retry. func (*pipeOpenState) TryOpenWriteOnly(ctx context.Context, opener NonBlockingOpener) (*pipeOperations, error) { hostFile, err := opener.NonBlockingOpen(ctx, fs.PermMask{Write: true}) if unwrapError(err) == unix.ENXIO { - return nil, syserror.ErrWouldBlock + return nil, linuxerr.ErrWouldBlock } if err != nil { return nil, err diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 7b3ff191f..e1587288e 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -25,12 +25,11 @@ import ( "github.com/google/uuid" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -146,18 +145,18 @@ func TestTryOpen(t *testing.T) { err: unix.ENOENT, }, { - desc: "Blocking Write only returns with syserror.ErrWouldBlock", + desc: "Blocking Write only returns with linuxerr.ErrWouldBlock", makePipe: true, flags: fs.FileFlags{Write: true}, expectFile: false, - err: syserror.ErrWouldBlock, + err: linuxerr.ErrWouldBlock, }, { - desc: "Blocking Read only returns with syserror.ErrWouldBlock", + desc: "Blocking Read only returns with linuxerr.ErrWouldBlock", makePipe: true, flags: fs.FileFlags{Read: true}, expectFile: false, - err: syserror.ErrWouldBlock, + err: linuxerr.ErrWouldBlock, }, } { name := pipename() @@ -316,7 +315,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { // another writer comes along. This means we can open the same pipe write only // with no problems + write to it, given that opener.Open already tried to open // the pipe RDONLY and succeeded, which we know happened if TryOpen returns - // syserror.ErrwouldBlock. + // linuxerr.ErrwouldBlock. // // This simulates the open(RDONLY) <-> open(WRONLY)+write race we care about, but // does not cause our test to be racy (which would be terrible). @@ -328,8 +327,8 @@ func TestCopiedReadAheadBuffer(t *testing.T) { pipeOps.Release(ctx) t.Fatalf("open(%s, %o) got file, want nil", name, unix.O_RDONLY) } - if err != syserror.ErrWouldBlock { - t.Fatalf("open(%s, %o) got error %v, want %v", name, unix.O_RDONLY, err, syserror.ErrWouldBlock) + if err != linuxerr.ErrWouldBlock { + t.Fatalf("open(%s, %o) got error %v, want %v", name, unix.O_RDONLY, err, linuxerr.ErrWouldBlock) } // Then open the same pipe write only and write some bytes to it. The next @@ -515,8 +514,8 @@ func assertReaderHungup(t *testing.T, desc string, reader io.Reader) bool { } func assertWriterHungup(t *testing.T, desc string, writer io.Writer) bool { - if _, err := writer.Write([]byte("hello")); unwrapError(err) != unix.EPIPE { - t.Errorf("%s: write to self after hangup got error %v, want %v", desc, err, unix.EPIPE) + if _, err := writer.Write([]byte("hello")); !linuxerr.Equals(linuxerr.EPIPE, unwrapError(err)) { + t.Errorf("%s: write to self after hangup got error %v, want %v", desc, err, linuxerr.EPIPE) return false } return true diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index ab0e9dac7..63900e766 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -21,14 +21,14 @@ import ( "testing" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) func singlePipeFD() (int, error) { @@ -209,17 +209,17 @@ func TestPipeRequest(t *testing.T) { { desc: "ReadDir on pipe returns ENOTDIR", context: &ReadDir{}, - err: unix.ENOTDIR, + err: linuxerr.ENOTDIR, }, { desc: "Fsync on pipe returns EINVAL", context: &Fsync{}, - err: unix.EINVAL, + err: linuxerr.EINVAL, }, { desc: "Seek on pipe returns ESPIPE", context: &Seek{}, - err: unix.ESPIPE, + err: linuxerr.ESPIPE, }, { desc: "Readv on pipe from empty buffer returns nil", @@ -237,7 +237,7 @@ func TestPipeRequest(t *testing.T) { context: &Readv{Dst: usermem.BytesIOSequence(make([]byte, 10))}, flags: fs.FileFlags{Read: true}, keepOpenPartner: true, - err: syserror.ErrWouldBlock, + err: linuxerr.ErrWouldBlock, }, { desc: "Writev on pipe from empty buffer returns nil", @@ -248,7 +248,7 @@ func TestPipeRequest(t *testing.T) { desc: "Writev on pipe from non-empty buffer and closed partner returns EPIPE", context: &Writev{Src: usermem.BytesIOSequence([]byte("hello"))}, flags: fs.FileFlags{Write: true}, - err: unix.EPIPE, + err: linuxerr.EPIPE, }, { desc: "Writev on pipe from non-empty buffer and open partner succeeds", @@ -307,7 +307,11 @@ func TestPipeRequest(t *testing.T) { t.Errorf("%s: unknown request type %T", test.desc, test.context) } - if unwrapError(err) != test.err { + if linuxErr, ok := test.err.(*errors.Error); ok { + if !linuxerr.Equals(linuxErr, unwrapError(err)) { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + } + } else if test.err != unwrapError(err) { t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) } } @@ -405,8 +409,8 @@ func TestPipeReadsAccumulate(t *testing.T) { n, err := p.Read(ctx, file, iov, 0) total := n iov = iov.DropFirst64(n) - if err != syserror.ErrWouldBlock { - t.Fatalf("Readv got error %v, want %v", err, syserror.ErrWouldBlock) + if err != linuxerr.ErrWouldBlock { + t.Fatalf("Readv got error %v, want %v", err, linuxerr.ErrWouldBlock) } // Write a few more bytes to allow us to read more/accumulate. @@ -474,8 +478,8 @@ func TestPipeWritesAccumulate(t *testing.T) { } iov := usermem.BytesIOSequence(writeBuffer) n, err := p.Write(ctx, file, iov, 0) - if err != syserror.ErrWouldBlock { - t.Fatalf("Writev got error %v, want %v", err, syserror.ErrWouldBlock) + if err != linuxerr.ErrWouldBlock { + t.Fatalf("Writev got error %v, want %v", err, linuxerr.ErrWouldBlock) } if n != int64(pipeSize) { t.Fatalf("Writev partial write, got: %v, want %v", n, pipeSize) diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 57f904801..df04f044d 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsmetric" @@ -27,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -195,10 +195,10 @@ func (f *File) EventUnregister(e *waiter.Entry) { // offset to the value returned by f.FileOperations.Seek if the operation // is successful. // -// Returns syserror.ErrInterrupted if seeking was interrupted. +// Returns linuxerr.ErrInterrupted if seeking was interrupted. func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64, error) { if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -217,10 +217,10 @@ func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64 // Readdir unconditionally updates the access time on the File's Inode, // see fs/readdir.c:iterate_dir. // -// Returns syserror.ErrInterrupted if reading was interrupted. +// Returns linuxerr.ErrInterrupted if reading was interrupted. func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -232,13 +232,13 @@ func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { // Readv calls f.FileOperations.Read with f as the File, advancing the file // offset if f.FileOperations.Read returns bytes read > 0. // -// Returns syserror.ErrInterrupted if reading was interrupted. +// Returns linuxerr.ErrInterrupted if reading was interrupted. func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) { start := fsmetric.StartReadWait() defer fsmetric.FinishReadWait(fsmetric.ReadWait, start) if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } fsmetric.Reads.Increment() @@ -260,7 +260,7 @@ func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) defer fsmetric.FinishReadWait(fsmetric.ReadWait, start) if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } fsmetric.Reads.Increment() @@ -276,10 +276,10 @@ func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) // unavoidably racy for network file systems. Writev also truncates src // to avoid overrunning the current file size limit if necessary. // -// Returns syserror.ErrInterrupted if writing was interrupted. +// Returns linuxerr.ErrInterrupted if writing was interrupted. func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error) { if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) // Handle append mode. @@ -297,7 +297,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error case ok && limit == 0: unlockAppendMu() f.mu.Unlock() - return 0, syserror.ErrExceedsFileSizeLimit + return 0, linuxerr.ErrExceedsFileSizeLimit case ok: src = src.TakeFirst64(limit) } @@ -335,7 +335,7 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 limit, ok := f.checkLimit(ctx, offset) switch { case ok && limit == 0: - return 0, syserror.ErrExceedsFileSizeLimit + return 0, linuxerr.ErrExceedsFileSizeLimit case ok: src = src.TakeFirst64(limit) } @@ -352,7 +352,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { if err != nil { // This is an odd error, we treat it as evidence that // something is terribly wrong with the filesystem. - return syserror.EIO + return linuxerr.EIO } // Update the offset. @@ -381,10 +381,10 @@ func (f *File) checkLimit(ctx context.Context, offset int64) (int64, bool) { // Fsync calls f.FileOperations.Fsync with f as the File. // -// Returns syserror.ErrInterrupted if syncing was interrupted. +// Returns linuxerr.ErrInterrupted if syncing was interrupted. func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncType) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -393,10 +393,10 @@ func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncT // Flush calls f.FileOperations.Flush with f as the File. // -// Returns syserror.ErrInterrupted if syncing was interrupted. +// Returns linuxerr.ErrInterrupted if syncing was interrupted. func (f *File) Flush(ctx context.Context) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -405,10 +405,10 @@ func (f *File) Flush(ctx context.Context) error { // ConfigureMMap calls f.FileOperations.ConfigureMMap with f as the File. // -// Returns syserror.ErrInterrupted if interrupted. +// Returns linuxerr.ErrInterrupted if interrupted. func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { if !f.mu.Lock(ctx) { - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -417,10 +417,10 @@ func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { // UnstableAttr calls f.FileOperations.UnstableAttr with f as the File. // -// Returns syserror.ErrInterrupted if interrupted. +// Returns linuxerr.ErrInterrupted if interrupted. func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { if !f.mu.Lock(ctx) { - return UnstableAttr{}, syserror.ErrInterrupted + return UnstableAttr{}, linuxerr.ErrInterrupted } defer f.mu.Unlock() @@ -495,7 +495,7 @@ type lockedReader struct { // Read implements io.Reader.Read. func (r *lockedReader) Read(buf []byte) (int, error) { if r.Ctx.Interrupted() { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.Offset) r.Offset += n @@ -505,7 +505,7 @@ func (r *lockedReader) Read(buf []byte) (int, error) { // ReadAt implements io.Reader.ReadAt. func (r *lockedReader) ReadAt(buf []byte, offset int64) (int, error) { if r.Ctx.Interrupted() { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), offset) return int(n), err @@ -530,7 +530,7 @@ type lockedWriter struct { // Write implements io.Writer.Write. func (w *lockedWriter) Write(buf []byte) (int, error) { if w.Ctx.Interrupted() { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } n, err := w.WriteAt(buf, w.Offset) w.Offset += int64(n) @@ -549,7 +549,7 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { // contract. Enforce that here. for written < len(buf) { if w.Ctx.Interrupted() { - return written, syserror.ErrInterrupted + return written, linuxerr.ErrInterrupted } var n int64 n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written)) diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index 6ec721022..ce47c3907 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -120,7 +120,7 @@ type FileOperations interface { // Files with !FileFlags.Pwrite. // // If only part of src could be written, Write must return an error - // indicating why (e.g. syserror.ErrWouldBlock). + // indicating why (e.g. linuxerr.ErrWouldBlock). // // Write does not check permissions nor flags. // diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 696613f3a..a27dd0b9a 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -16,13 +16,14 @@ package fs import ( "io" + "math" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -245,7 +246,7 @@ func (f *overlayFileOperations) onTop(ctx context.Context, file *File, fn func(* // Something very wrong; return a generic filesystem // error to avoid propagating internals. f.upperMu.Unlock() - return syserror.EIO + return linuxerr.EIO } // Save upper file. @@ -357,13 +358,16 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt } if !o.isMappableLocked() { - return syserror.ENODEV + return linuxerr.ENODEV } - // FIXME(jamieliu): This is a copy/paste of fsutil.GenericConfigureMMap, - // which we can't use because the overlay implementation is in package fs, - // so depending on fs/fsutil would create a circular dependency. Move - // overlay to fs/overlay. + // TODO(gvisor.dev/issue/1624): This is a copy/paste of + // fsutil.GenericConfigureMMap, which we can't use because the overlay + // implementation is in package fs, so depending on fs/fsutil would create + // a circular dependency. VFS2 overlay doesn't have this issue. + if opts.Offset+opts.Length > math.MaxInt64 { + return linuxerr.EOVERFLOW + } opts.Mappable = o opts.MappingIdentity = file file.IncRef() @@ -407,7 +411,7 @@ func (f *overlayFileOperations) Ioctl(ctx context.Context, overlayFile *File, io // copy up on any ioctl would be too drastic. In the future, it can have a // list of ioctls that are safe to send to lower and a list that triggers a // copy up. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } return f.upper.FileOperations.Ioctl(ctx, f.upper, io, args) } @@ -417,7 +421,7 @@ func (f *overlayFileOperations) FifoSize(ctx context.Context, overlayFile *File) err = f.onTop(ctx, overlayFile, func(file *File, ops FileOperations) error { sz, ok := ops.(FifoSizer) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } rv, err = sz.FifoSize(ctx, file) return err @@ -432,11 +436,11 @@ func (f *overlayFileOperations) SetFifoSize(size int64) (rv int64, err error) { if f.upper == nil { // Named pipes cannot be copied up and changes to the lower are prohibited. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } sz, ok := f.upper.FileOperations.(FifoSizer) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return sz.SetFifoSize(size) } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 44587bb37..a346c316b 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -80,23 +80,33 @@ func AsyncBarrier() { // Async executes a function asynchronously. // // Async must not be called recursively. +// +checklocksignore func Async(f func()) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f() - }() + go asyncWork(f) // S/R-SAFE: AsyncBarrier must be called. +} + +// +checklocksignore +func asyncWork(f func()) { + // Ensure RUnlock in case of panic. + defer workMu.RUnlock() + f() } // AsyncWithContext is just like Async, except that it calls the asynchronous // function with the given context as argument. This function exists to avoid // needing to allocate an extra function on the heap in a hot path. +// +checklocksignore func AsyncWithContext(ctx context.Context, f func(context.Context)) { workMu.RLock() - go func() { // S/R-SAFE: AsyncBarrier must be called. - defer workMu.RUnlock() // Ensure RUnlock in case of panic. - f(ctx) - }() + go asyncWorkWithContext(ctx, f) +} + +// +checklocksignore +func asyncWorkWithContext(ctx context.Context, f func(context.Context)) { + // Ensure RUnlock in case of panic. + defer workMu.RUnlock() + f(ctx) } // AsyncErrorBarrier waits for all outstanding asynchronous work to complete, or diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6469cc3a9..1a59800ea 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -76,6 +76,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/safemem", @@ -89,7 +90,6 @@ go_library( "//pkg/sentry/usage", "//pkg/state", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -106,13 +106,13 @@ go_test( library = ":fsutil", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/safemem", "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index dc9efa5df..38e3ed42d 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -16,12 +16,13 @@ package fsutil import ( "io" + "math" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -45,7 +46,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, // Does the Inode represents a non-seekable type? if fs.IsPipe(inode.StableAttr) || fs.IsSocket(inode.StableAttr) { - return current, syserror.ESPIPE + return current, linuxerr.ESPIPE } // Does the Inode represent a character device? @@ -63,12 +64,12 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, switch inode.StableAttr.Type { case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: if offset < 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return offset, nil case fs.Directory, fs.SpecialDirectory: if offset != 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } // SEEK_SET to 0 moves the directory "cursor" to the beginning. if dirCursor != nil { @@ -76,22 +77,22 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, } return 0, nil default: - return current, syserror.EINVAL + return current, linuxerr.EINVAL } case fs.SeekCurrent: switch inode.StableAttr.Type { case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: if current+offset < 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return current + offset, nil case fs.Directory, fs.SpecialDirectory: if offset != 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return current, nil default: - return current, syserror.EINVAL + return current, linuxerr.EINVAL } case fs.SeekEnd: switch inode.StableAttr.Type { @@ -103,14 +104,14 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, } sz := uattr.Size if sz+offset < 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } return sz + offset, nil // FIXME(b/34778850): This is not universally correct. // Remove SpecialDirectory. case fs.SpecialDirectory: if offset != 0 { - return current, syserror.EINVAL + return current, linuxerr.EINVAL } // SEEK_END to 0 moves the directory "cursor" to the end. // @@ -121,12 +122,12 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, // futile (EOF will always be the result). return fs.FileMaxOffset, nil default: - return current, syserror.EINVAL + return current, linuxerr.EINVAL } } // Not a valid seek request. - return current, syserror.EINVAL + return current, linuxerr.EINVAL } // FileGenericSeek implements fs.FileOperations.Seek for files that use a @@ -152,7 +153,7 @@ type FileNoSeek struct{} // Seek implements fs.FileOperations.Seek. func (FileNoSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // FilePipeSeek implements fs.FileOperations.Seek and can be used for files @@ -161,7 +162,7 @@ type FilePipeSeek struct{} // Seek implements fs.FileOperations.Seek. func (FilePipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // FileNotDirReaddir implements fs.FileOperations.Readdir for non-directories. @@ -169,7 +170,7 @@ type FileNotDirReaddir struct{} // Readdir implements fs.FileOperations.FileNotDirReaddir. func (FileNotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { - return 0, syserror.ENOTDIR + return 0, linuxerr.ENOTDIR } // FileNoFsync implements fs.FileOperations.Fsync for files that don't support @@ -178,7 +179,7 @@ type FileNoFsync struct{} // Fsync implements fs.FileOperations.Fsync. func (FileNoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { - return syserror.EINVAL + return linuxerr.EINVAL } // FileNoopFsync implements fs.FileOperations.Fsync for files that don't need @@ -204,12 +205,15 @@ type FileNoMMap struct{} // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (FileNoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { - return syserror.ENODEV + return linuxerr.ENODEV } // GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most // filesystems that support memory mapping. func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpts) error { + if opts.Offset+opts.Length > math.MaxInt64 { + return linuxerr.EOVERFLOW + } opts.Mappable = m opts.MappingIdentity = file file.IncRef() @@ -222,7 +226,7 @@ type FileNoIoctl struct{} // Ioctl implements fs.FileOperations.Ioctl. func (FileNoIoctl) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // FileNoSplice implements fs.FileOperations.ReadFrom and @@ -231,12 +235,12 @@ type FileNoSplice struct{} // WriteTo implements fs.FileOperations.WriteTo. func (FileNoSplice) WriteTo(context.Context, *fs.File, io.Writer, int64, bool) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // ReadFrom implements fs.FileOperations.ReadFrom. func (FileNoSplice) ReadFrom(context.Context, *fs.File, io.Reader, int64) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // DirFileOperations implements most of fs.FileOperations for directories, @@ -254,12 +258,12 @@ type DirFileOperations struct { // Read implements fs.FileOperations.Read func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements fs.FileOperations.Write. func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // StaticDirFileOperations implements fs.FileOperations for directories with @@ -345,7 +349,7 @@ func NewFileStaticContentReader(b []byte) FileStaticContentReader { // Read implements fs.FileOperations.Read. func (scr *FileStaticContentReader) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset >= int64(len(scr.content)) { return 0, nil @@ -367,7 +371,7 @@ type FileNoRead struct{} // Read implements fs.FileOperations.Read. func (FileNoRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // FileNoWrite implements fs.FileOperations.Write to return EINVAL. @@ -375,7 +379,7 @@ type FileNoWrite struct{} // Write implements fs.FileOperations.Write. func (FileNoWrite) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // FileNoopRead implement fs.FileOperations.Read as a noop. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 23528bf25..37ddb1a3c 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -93,7 +93,8 @@ func NewHostFileMapper() *HostFileMapper { func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() - for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + chunkStart := mr.Start &^ chunkMask + for { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) if refs+pgs < refs { @@ -101,6 +102,10 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } f.refs[chunkStart] = refs + pgs + chunkStart += chunkSize + if chunkStart >= mr.End || chunkStart == 0 { + break + } } } @@ -112,7 +117,8 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() - for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + chunkStart := mr.Start &^ chunkMask + for { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) switch { @@ -128,6 +134,10 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { case refs < pgs: panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } + chunkStart += chunkSize + if chunkStart >= mr.End || chunkStart == 0 { + break + } } } @@ -161,7 +171,8 @@ func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, if write { prot |= unix.PROT_WRITE } - for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + chunkStart := fr.Start &^ chunkMask + for { m, ok := f.mappings[chunkStart] if !ok { addr, _, errno := unix.Syscall6( @@ -201,6 +212,10 @@ func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, endOff = fr.End - chunkStart } fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) + chunkStart += chunkSize + if chunkStart >= fr.End || chunkStart == 0 { + break + } } return nil } diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 85e7e35db..06a994193 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -17,12 +17,12 @@ package fsutil import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -166,7 +166,7 @@ func (i *InodeSimpleAttributes) DropLink() { // StatFS implements fs.InodeOperations.StatFS. func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { if i.fsType == 0 { - return fs.Info{}, syserror.ENOSYS + return fs.Info{}, linuxerr.ENOSYS } return fs.Info{Type: i.fsType}, nil } @@ -218,7 +218,7 @@ func (i *InodeSimpleExtendedAttributes) GetXattr(_ context.Context, _ *fs.Inode, value, ok := i.xattrs[name] i.mu.RUnlock() if !ok { - return "", syserror.ENOATTR + return "", linuxerr.ENOATTR } return value, nil } @@ -229,17 +229,17 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode, defer i.mu.Unlock() if i.xattrs == nil { if flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } i.xattrs = make(map[string]string) } _, ok := i.xattrs[name] if ok && flags&linux.XATTR_CREATE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } if !ok && flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } i.xattrs[name] = value @@ -265,7 +265,7 @@ func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Ino delete(i.xattrs, name) return nil } - return syserror.ENOATTR + return linuxerr.ENOATTR } // staticFile is a file with static contents. It is returned by @@ -293,7 +293,7 @@ type InodeNoStatFS struct{} // StatFS implements fs.InodeOperations.StatFS. func (InodeNoStatFS) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS + return fs.Info{}, linuxerr.ENOSYS } // InodeStaticFileGetter implements GetFile for a file with static contents. @@ -331,52 +331,52 @@ type InodeNotDirectory struct{} // Lookup implements fs.InodeOperations.Lookup. func (InodeNotDirectory) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // Create implements fs.InodeOperations.Create. func (InodeNotDirectory) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // CreateLink implements fs.InodeOperations.CreateLink. func (InodeNotDirectory) CreateLink(context.Context, *fs.Inode, string, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // CreateHardLink implements fs.InodeOperations.CreateHardLink. func (InodeNotDirectory) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // CreateDirectory implements fs.InodeOperations.CreateDirectory. func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Bind implements fs.InodeOperations.Bind. func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, transport.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // CreateFifo implements fs.InodeOperations.CreateFifo. func (InodeNotDirectory) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Remove implements fs.InodeOperations.Remove. func (InodeNotDirectory) Remove(context.Context, *fs.Inode, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // RemoveDirectory implements fs.InodeOperations.RemoveDirectory. func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Rename implements fs.FileOperations.Rename. func (InodeNotDirectory) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error { - return syserror.EINVAL + return linuxerr.EINVAL } // InodeNotSocket can be used by Inodes that are not sockets. @@ -392,7 +392,7 @@ type InodeNotTruncatable struct{} // Truncate implements fs.InodeOperations.Truncate. func (InodeNotTruncatable) Truncate(context.Context, *fs.Inode, int64) error { - return syserror.EINVAL + return linuxerr.EINVAL } // InodeIsDirTruncate implements fs.InodeOperations.Truncate for directories. @@ -400,7 +400,7 @@ type InodeIsDirTruncate struct{} // Truncate implements fs.InodeOperations.Truncate. func (InodeIsDirTruncate) Truncate(context.Context, *fs.Inode, int64) error { - return syserror.EISDIR + return linuxerr.EISDIR } // InodeNoopTruncate implements fs.InodeOperations.Truncate as a noop. @@ -416,7 +416,7 @@ type InodeNotRenameable struct{} // Rename implements fs.InodeOperations.Rename. func (InodeNotRenameable) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error { - return syserror.EINVAL + return linuxerr.EINVAL } // InodeNotOpenable can be used by Inodes that cannot be opened. @@ -424,7 +424,7 @@ type InodeNotOpenable struct{} // GetFile implements fs.InodeOperations.GetFile. func (InodeNotOpenable) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { - return nil, syserror.EIO + return nil, linuxerr.EIO } // InodeNotVirtual can be used by Inodes that are not virtual. @@ -448,12 +448,12 @@ type InodeNotSymlink struct{} // Readlink implements fs.InodeOperations.Readlink. func (InodeNotSymlink) Readlink(context.Context, *fs.Inode) (string, error) { - return "", syserror.ENOLINK + return "", linuxerr.ENOLINK } // Getlink implements fs.InodeOperations.Getlink. func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } // InodeNoExtendedAttributes can be used by Inodes that do not support @@ -462,22 +462,22 @@ type InodeNoExtendedAttributes struct{} // GetXattr implements fs.InodeOperations.GetXattr. func (InodeNoExtendedAttributes) GetXattr(context.Context, *fs.Inode, string, uint64) (string, error) { - return "", syserror.EOPNOTSUPP + return "", linuxerr.EOPNOTSUPP } // SetXattr implements fs.InodeOperations.SetXattr. func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, string, uint32) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // ListXattr implements fs.InodeOperations.ListXattr. func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) { - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } // RemoveXattr implements fs.InodeOperations.RemoveXattr. func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // InodeNoopRelease implements fs.InodeOperations.Release as a noop. @@ -512,7 +512,7 @@ type InodeNotAllocatable struct{} // Allocate implements fs.InodeOperations.Allocate. func (InodeNotAllocatable) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // InodeNoopAllocate implements fs.InodeOperations.Allocate as a noop. @@ -528,5 +528,5 @@ type InodeIsDirAllocate struct{} // Allocate implements fs.InodeOperations.Allocate. func (InodeIsDirAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { - return syserror.EISDIR + return linuxerr.EISDIR } diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index e107c3096..25e76d9f2 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -20,13 +20,13 @@ import ( "testing" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -244,7 +244,7 @@ func (*sliceBackingFile) FD() int { } func (f *sliceBackingFile) Allocate(ctx context.Context, offset int64, length int64) error { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } type noopMappingSpace struct{} diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 94cb05246..ee2f287d9 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -26,6 +26,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/hostarch", "//pkg/log", @@ -47,7 +49,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/unet", "//pkg/usermem", "//pkg/waiter", @@ -62,10 +63,10 @@ go_test( library = ":gofer", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/p9", "//pkg/p9/p9test", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 73d80d9b5..62a517cd7 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -226,7 +226,7 @@ func (f *fileOperations) maybeSync(ctx context.Context, file *fs.File, offset, n func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { if fs.IsDir(file.Dirent.Inode.StableAttr) { // Not all remote file systems enforce this so this client does. - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } var ( @@ -294,7 +294,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO if fs.IsDir(file.Dirent.Inode.StableAttr) { // Not all remote file systems enforce this so this client does. f.incrementReadCounters(start) - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 546ee7d04..4924debeb 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -19,8 +19,8 @@ import ( "testing" "time" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/p9/p9test" "gvisor.dev/gvisor/pkg/sentry/contexttest" @@ -97,7 +97,7 @@ func TestLookup(t *testing.T) { }, { name: "mock Walk fails (function fails)", - want: unix.ENOENT, + want: linuxerr.ENOENT, }, } @@ -123,7 +123,7 @@ func TestLookup(t *testing.T) { var newInodeOperations fs.InodeOperations if dirent != nil { if dirent.IsNegative() { - err = unix.ENOENT + err = linuxerr.ENOENT } else { newInodeOperations = dirent.Inode.InodeOperations } @@ -131,9 +131,11 @@ func TestLookup(t *testing.T) { // Check return values. if err != test.want { + t.Logf("err: %v %T", err, err) t.Errorf("Lookup got err %v, want %v", err, test.want) } if err == nil && newInodeOperations == nil { + t.Logf("err: %v %T", err, err) t.Errorf("Lookup got non-nil err and non-nil node, wanted at least one non-nil") } }) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index da3178527..c3856094f 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -20,6 +20,8 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + gErr "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" @@ -31,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // inodeOperations implements fs.InodeOperations. @@ -476,7 +477,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi switch d.Inode.StableAttr.Type { case fs.Socket: if i.session().overrides != nil { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } return i.getFileSocket(ctx, d, flags) case fs.Pipe: @@ -676,7 +677,7 @@ func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string // Getlink implementfs fs.InodeOperations.Getlink. func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { if !fs.IsSymlink(i.fileState.sattr) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } return nil, fs.ErrResolveViaReadlink } @@ -714,16 +715,16 @@ func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) er if i.fileState.hostMappable != nil { return fsutil.GenericConfigureMMap(file, i.fileState.hostMappable, opts) } - return syserror.ENODEV + return linuxerr.ENODEV } func init() { - syserror.AddErrorUnwrapper(func(err error) (unix.Errno, bool) { + linuxerr.AddErrorUnwrapper(func(err error) (*gErr.Error, bool) { if _, ok := err.(p9.ErrSocket); ok { // Treat as an I/O error. - return unix.EIO, true + return linuxerr.EIO, true } - return 0, false + return nil, false }) } diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index e2af1d2ae..19f91f010 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -112,13 +112,6 @@ func (i *inodeFileState) loadLoading(_ struct{}) { // +checklocks:i.loading func (i *inodeFileState) afterLoad() { load := func() (err error) { - // See comment on i.loading(). - defer func() { - if err == nil { - i.loading.Unlock() - } - }() - // Manually restore the p9.File. name, ok := i.s.inodeMappings[i.sattr.InodeID] if !ok { @@ -167,6 +160,9 @@ func (i *inodeFileState) afterLoad() { i.savedUAttr = nil } + // See comment on i.loading(). This only unlocks on the + // non-error path. + i.loading.Unlock() // +checklocksforce: per comment. return nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 940838a44..2f8769f1e 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -18,13 +18,13 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) // maxFilenameLen is the maximum length of a filename. This is dictated by 9P's @@ -43,10 +43,11 @@ func changeType(mode p9.FileMode, newType p9.FileMode) p9.FileMode { // policy. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } - cp := i.session().cachePolicy + s := i.session() + cp := s.cachePolicy if cp.cacheReaddir() { // Check to see if we have readdirCache that indicates the // child does not exist. Avoid holding readdirMu longer than @@ -58,7 +59,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string if cp.cacheNegativeDirents() { return fs.NewNegativeDirent(name), nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } i.readdirMu.Unlock() } @@ -66,18 +67,18 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // Get a p9.File for name. qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { if cp.cacheNegativeDirents() { // Return a negative Dirent. It will stay cached until something // is created over it. return fs.NewNegativeDirent(name), nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return nil, err } - if i.session().overrides != nil { + if s.overrides != nil { // Check if file belongs to a internal named pipe. Note that it doesn't need // to check for sockets because it's done in newInodeOperations below. deviceKey := device.MultiDeviceKey{ @@ -85,13 +86,13 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string SecondaryDevice: i.session().connID, Inode: qids[0].Path, } - unlock := i.session().overrides.lock() - if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil { - unlock() + s.overrides.lock() + if pipeInode := s.overrides.getPipe(deviceKey); pipeInode != nil { + s.overrides.unlock() pipeInode.IncRef() return fs.NewDirent(ctx, pipeInode, name), nil } - unlock() + s.overrides.unlock() } // Construct the Inode operations. @@ -106,7 +107,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // Ownership is currently ignored. func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } // Create replaces the directory fid with the newly created/opened @@ -167,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string hostFile.Close() } unopened.close(ctx) - return nil, syserror.EIO + return nil, linuxerr.EIO } qid := qids[0] @@ -195,7 +196,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string // CreateLink uses Create to create a symlink between oldname and newname. func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { if len(newname) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } owner := fs.FileOwnerFromContext(ctx) @@ -209,29 +210,32 @@ func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname // CreateHardLink implements InodeOperations.CreateHardLink. func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, target *fs.Inode, newName string) error { if len(newName) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } targetOpts, ok := target.InodeOperations.(*inodeOperations) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(inode) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { // Increase link count. targetOpts.cachingInodeOps.IncLinks(ctx) } + i.touchModificationAndStatusChangeTime(ctx, inode) return nil } // CreateDirectory uses Create to create a directory named s under inodeOperations. -func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s string, perm fs.FilePermissions) error { - if len(s) > maxFilenameLen { - return syserror.ENAMETOOLONG +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + if len(name) > maxFilenameLen { + return linuxerr.ENAMETOOLONG } // If the parent directory has setgid enabled, change the new directory's @@ -246,16 +250,18 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s perm.SetGID = true } - if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + if _, err := i.fileState.file.mkdir(ctx, name, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Increase link count. // // N.B. This will update the modification time. i.cachingInodeOps.IncLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -265,16 +271,17 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s // Bind implements InodeOperations.Bind. func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } - if i.session().overrides == nil { - return nil, syserror.EOPNOTSUPP + s := i.session() + if s.overrides == nil { + return nil, linuxerr.EOPNOTSUPP } // Stabilize the override map while creation is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket) if err != nil { @@ -283,22 +290,23 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) + s.overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) return childDir, nil } // CreateFifo implements fs.InodeOperations.CreateFifo. func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } owner := fs.FileOwnerFromContext(ctx) mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe // N.B. FIFOs use major/minor numbers 0. + s := i.session() if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - if i.session().overrides == nil || err != syserror.EPERM { + if s.overrides == nil || !linuxerr.Equals(linuxerr.EPERM, err) { return err } // If gofer doesn't support mknod, check if we can create an internal fifo. @@ -310,13 +318,14 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st } func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error { - if i.session().overrides == nil { - return syserror.EPERM + s := i.session() + if s.overrides == nil { + return linuxerr.EPERM } // Stabilize the override map while creation is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe) if err != nil { @@ -335,7 +344,7 @@ func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode) + s.overrides.addPipe(fileOps.fileState.key, childDir, inode) return nil } @@ -382,11 +391,12 @@ func (i *inodeOperations) createEndpointFile(ctx context.Context, dir *fs.Inode, // Remove implements InodeOperations.Remove. func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } + s := i.session() var key *device.MultiDeviceKey - if i.session().overrides != nil { + if s.overrides != nil { // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { @@ -401,8 +411,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string } // Stabilize the override map while deletion is in progress. - unlock := i.session().overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() } } } @@ -411,7 +421,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(ctx, *key) + s.overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) @@ -421,18 +431,20 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string // Remove implements InodeOperations.RemoveDirectory. func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } // 0x200 = AT_REMOVEDIR. if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { return err } - if i.session().cachePolicy.cacheUAttrs(dir) { + + s := i.session() + if s.cachePolicy.cacheUAttrs(dir) { // Decrease link count and updates atime. i.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -442,12 +454,12 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na // Rename renames this node. func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { if len(newName) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } // Don't allow renames across different mounts. if newParent.MountSource != oldParent.MountSource { - return syserror.EXDEV + return linuxerr.EXDEV } // Unwrap the new parent to a *inodeOperations. @@ -462,12 +474,13 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Is the renamed entity a directory? Fix link counts. + s := i.session() if fs.IsDir(i.fileState.sattr) { // Update cached state. - if i.session().cachePolicy.cacheUAttrs(oldParent) { + if s.cachePolicy.cacheUAttrs(oldParent) { oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) } - if i.session().cachePolicy.cacheUAttrs(newParent) { + if s.cachePolicy.cacheUAttrs(newParent) { // Only IncLinks if there is a new addition to // newParent. If this is replacement, then the total // count remains the same. @@ -476,7 +489,7 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } } } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Mark old directory dirty. oldParentInodeOperations.markDirectoryDirty() if oldParent != newParent { @@ -486,17 +499,18 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent } // Rename always updates ctime. - if i.session().cachePolicy.cacheUAttrs(inode) { + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchStatusChangeTime(ctx) } return nil } func (i *inodeOperations) touchModificationAndStatusChangeTime(ctx context.Context, inode *fs.Inode) { - if i.session().cachePolicy.cacheUAttrs(inode) { + s := i.session() + if s.cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchModificationAndStatusChangeTime(ctx) } - if i.session().cachePolicy.cacheReaddir() { + if s.cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 7cf3522ff..b7debeecb 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -98,9 +98,14 @@ func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. -func (e *overrideMaps) lock() func() { +// +checklocksacquire:e.mu +func (e *overrideMaps) lock() { e.mu.Lock() - return func() { e.mu.Unlock() } +} + +// +checklocksrelease:e.mu +func (e *overrideMaps) unlock() { + e.mu.Unlock() } // getBoundEndpoint returns the bound endpoint mapped to the given key. @@ -366,8 +371,8 @@ func newOverrideMaps() *overrideMaps { // fillKeyMap populates key and dirent maps upon restore from saved pathmap. func (s *session) fillKeyMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for ep, dirPath := range s.overrides.pathMap { _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) @@ -394,8 +399,8 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. func (s *session) fillPathMap(ctx context.Context) error { - unlock := s.overrides.lock() - defer unlock() + s.overrides.lock() + defer s.overrides.unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 8a1c69ac2..1fd8a0910 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -32,10 +32,11 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return nil } - if i.session().overrides != nil { - unlock := i.session().overrides.lock() - defer unlock() - ep := i.session().overrides.getBoundEndpoint(i.fileState.key) + s := i.session() + if s.overrides != nil { + s.overrides.lock() + defer s.overrides.unlock() + ep := s.overrides.getBoundEndpoint(i.fileState.key) if ep != nil { return ep } diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 3c45f6cc5..921612e9c 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -28,9 +28,9 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/fdnotifier", - "//pkg/iovec", "//pkg/log", "//pkg/marshal/primitive", "//pkg/refs", @@ -40,6 +40,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -51,7 +52,6 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/unet", "//pkg/usermem", diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 07bd078b7..1d0d95634 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -19,6 +19,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" @@ -27,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -200,7 +200,7 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I writer := fd.NewReadWriter(f.iops.fileState.FD()) n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer}) if isBlockError(err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } return n, err } @@ -231,7 +231,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO if n != 0 { err = nil } else { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } } return n, err @@ -268,7 +268,7 @@ func (f *fileOperations) Flush(context.Context, *fs.File) error { // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { if !canMap(file.Dirent.Inode) { - return syserror.ENODEV + return linuxerr.ENODEV } return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts) } diff --git a/pkg/sentry/fs/host/host.go b/pkg/sentry/fs/host/host.go index 081ba1dd8..9f6dbd7e9 100644 --- a/pkg/sentry/fs/host/host.go +++ b/pkg/sentry/fs/host/host.go @@ -17,8 +17,8 @@ package host import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) // filesystem is a host filesystem. @@ -40,7 +40,7 @@ func (*filesystem) Name() string { // Mount returns an error. Mounting hostfs is not allowed. func (*filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, dataObj interface{}) (*fs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // AllowUserMount prohibits users from using mount(2) with this file system. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index e299b532c..92d58e3e9 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -17,6 +17,7 @@ package host import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -113,7 +113,7 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa return nil } if mask.UID || mask.GID { - return syserror.EPERM + return linuxerr.EPERM } if mask.Perms { if err := unix.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil { @@ -219,53 +219,53 @@ func (i *inodeOperations) Release(context.Context) { // Lookup implements fs.InodeOperations.Lookup. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // Create implements fs.InodeOperations.Create. func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // CreateDirectory implements fs.InodeOperations.CreateDirectory. func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - return syserror.EPERM + return linuxerr.EPERM } // CreateLink implements fs.InodeOperations.CreateLink. func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { - return syserror.EPERM + return linuxerr.EPERM } // CreateHardLink implements fs.InodeOperations.CreateHardLink. func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { - return syserror.EPERM + return linuxerr.EPERM } // CreateFifo implements fs.InodeOperations.CreateFifo. func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.EPERM + return linuxerr.EPERM } // Remove implements fs.InodeOperations.Remove. func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // RemoveDirectory implements fs.InodeOperations.RemoveDirectory. func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // Rename implements fs.InodeOperations.Rename. func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { - return syserror.EPERM + return linuxerr.EPERM } // Bind implements fs.InodeOperations.Bind. func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } // BoundEndpoint implements fs.InodeOperations.BoundEndpoint. @@ -276,7 +276,7 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. // GetFile implements fs.InodeOperations.GetFile. func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { if fs.IsSocket(d.Inode.StableAttr) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } return newFile(ctx, d, flags, i), nil @@ -313,7 +313,7 @@ func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermM // SetOwner implements fs.InodeOperations.SetOwner. func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { - return syserror.EPERM + return linuxerr.EPERM } // SetPermissions implements fs.InodeOperations.SetPermissions. @@ -392,14 +392,14 @@ func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string // Getlink implements fs.InodeOperations.Getlink. func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { if !fs.IsSymlink(i.fileState.sattr) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } return nil, fs.ErrResolveViaReadlink } // StatFS implements fs.InodeOperations.StatFS. func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS + return fs.Info{}, linuxerr.ENOSYS } // AddLink implements fs.InodeOperations.AddLink. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 46a2dc47d..54c421775 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/refs" @@ -31,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/waiter" @@ -211,9 +211,9 @@ func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMess if n < totalLen && err == nil { // The host only returns a short write if it would otherwise // block (and only for stream sockets). - err = syserror.EAGAIN + err = linuxerr.EAGAIN } - if n > 0 && err != syserror.EAGAIN { + if n > 0 && !linuxerr.Equals(linuxerr.EAGAIN, err) { // The caller may need to block to send more data, but // otherwise there isn't anything that can be done about an // error with a partial write. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index 7380d75e7..d98e3c6d1 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -16,8 +16,8 @@ package host import ( "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/iovec" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // LINT.IfChange @@ -66,13 +66,13 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec if length > maxlen { if truncate { stopLen = maxlen - err = syserror.EAGAIN + err = linuxerr.EAGAIN } else { - return 0, nil, nil, syserror.EMSGSIZE + return 0, nil, nil, linuxerr.EMSGSIZE } } - if iovsRequired > iovec.MaxIovs { + if iovsRequired > hostfd.MaxSendRecvMsgIov { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 1183727ab..4e561c5ed 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -17,13 +17,13 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -126,7 +126,7 @@ func (t *TTYFileOperations) Release(ctx context.Context) { func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // Ignore arg[0]. This is the real FD: @@ -167,7 +167,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO pidns := kernel.PIDNamespaceFromContext(ctx) if pidns == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } t.mu.Lock() @@ -191,8 +191,8 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from // tty_check_change() to -ENOTTY. - if err == syserror.EIO { - return 0, syserror.ENOTTY + if linuxerr.Equals(linuxerr.EIO, err) { + return 0, linuxerr.ENOTTY } return 0, err } @@ -200,7 +200,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // Check that calling task's process group is in the TTY // session. if task.ThreadGroup().Session() != t.session { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } var pgIDP primitive.Int32 @@ -211,19 +211,19 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // pgID must be non-negative. if pgID < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Process group with pgID must exist in this PID namespace. pidns := task.PIDNamespace() pg := pidns.ProcessGroupWithID(pgID) if pg == nil { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } // Check that new process group is in the TTY session. if pg.Session() != t.session { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } t.fgProcessGroup = pg @@ -283,7 +283,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO unimpl.EmitUnimplementedEvent(ctx) fallthrough default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } @@ -326,7 +326,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // If the signal is SIGTTIN, then we are attempting to read // from the TTY. Don't send the signal and return EIO. if sig == linux.SIGTTIN { - return syserror.EIO + return linuxerr.EIO } // Otherwise, we are writing or changing terminal state. This is allowed. @@ -335,7 +335,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // If the process group is an orphan, return EIO. if pg.IsOrphan() { - return syserror.EIO + return linuxerr.EIO } // Otherwise, send the signal to the process group and return ERESTARTSYS. @@ -348,7 +348,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } // LINT.ThenChange(../../fsimpl/host/tty.go) diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index ab74724a3..f2a33cc14 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -19,12 +19,12 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) func nodeType(s *unix.Stat_t) fs.InodeType { @@ -96,9 +96,9 @@ type dirInfo struct { // LINT.IfChange // isBlockError unwraps os errors and checks if they are caused by EAGAIN or -// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +// EWOULDBLOCK. This is so they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { - if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + if linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) { return true } if pe, ok := err.(*os.PathError); ok { diff --git a/pkg/sentry/fs/host/util_amd64_unsafe.go b/pkg/sentry/fs/host/util_amd64_unsafe.go index 21782f1da..e90629f4e 100644 --- a/pkg/sentry/fs/host/util_amd64_unsafe.go +++ b/pkg/sentry/fs/host/util_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package host diff --git a/pkg/sentry/fs/host/util_arm64_unsafe.go b/pkg/sentry/fs/host/util_arm64_unsafe.go index ed8f5242a..9fbb93726 100644 --- a/pkg/sentry/fs/host/util_arm64_unsafe.go +++ b/pkg/sentry/fs/host/util_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package host diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 41a3c2047..2c6b9e9db 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -17,6 +17,7 @@ package fs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Inode is a file system object that can be simultaneously referenced by different @@ -298,7 +298,7 @@ func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error { func (i *Inode) CheckPermission(ctx context.Context, p PermMask) error { // First check the outer-most mounted filesystem. if p.Write && i.MountSource.Flags.ReadOnly { - return syserror.EROFS + return linuxerr.EROFS } if i.overlay != nil { @@ -312,7 +312,7 @@ func (i *Inode) CheckPermission(ctx context.Context, p PermMask) error { // we should not attempt to modify the writable layer if it // is mounted read-only. if p.Write && overlayUpperMountSource(i.MountSource).Flags.ReadOnly { - return syserror.EROFS + return linuxerr.EROFS } } @@ -324,7 +324,7 @@ func (i *Inode) check(ctx context.Context, p PermMask) error { return overlayCheck(ctx, i.overlay, p) } if !i.InodeOperations.Check(ctx, i, p) { - return syserror.EACCES + return linuxerr.EACCES } return nil } @@ -356,7 +356,7 @@ func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error // Truncate calls i.InodeOperations.Truncate with i as the Inode. func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { if IsDir(i.StableAttr) { - return syserror.EISDIR + return linuxerr.EISDIR } if i.overlay != nil { diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 2bbfb72ef..0f8022906 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -66,7 +66,7 @@ type InodeOperations interface { // // * A nil Dirent and a non-nil error. If the reason that Lookup failed // was because the name does not exist under Inode, then must return - // syserror.ENOENT. + // linuxerr.ENOENT. // // * If name does not exist under dir and the file system wishes this // fact to be cached, a non-nil Dirent containing a nil Inode and a @@ -283,7 +283,7 @@ type InodeOperations interface { // // Any error returned from Getlink other than ErrResolveViaReadlink // indicates the caller's inability to traverse this Inode as a link - // (e.g. syserror.ENOLINK indicates that the Inode is not a link, + // (e.g. linuxerr.ENOLINK indicates that the Inode is not a link, // syscall.EPERM indicates that traversing the link is not allowed, etc). Getlink(context.Context, *Inode) (*Dirent, error) diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index e97afc626..21ad7fa69 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) func overlayHasWhiteout(ctx context.Context, parent *Inode, name string) bool { @@ -71,7 +71,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // A file could have been created over a whiteout, so we need to // check if something exists in the upper file system first. child, err := parent.upper.Lookup(ctx, name) - if err != nil && err != syserror.ENOENT { + if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { // We encountered an error that an overlay cannot handle, // we must propagate it to the caller. parent.copyMu.RUnlock() @@ -102,7 +102,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Upper fs is not OK with a negative Dirent // being cached in the Dirent tree, so don't // return one. - return nil, false, syserror.ENOENT + return nil, false, linuxerr.ENOENT } entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { @@ -125,7 +125,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Check the lower file system. child, err := parent.lower.Lookup(ctx, name) // Same song and dance as above. - if err != nil && err != syserror.ENOENT { + if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { // Don't leak resources. if upperInode != nil { upperInode.DecRef(ctx) @@ -164,7 +164,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if negativeUpperChild { return NewNegativeDirent(name), false, nil } - return nil, false, syserror.ENOENT + return nil, false, linuxerr.ENOENT } // Did we find a lower Inode? Remember this because we may decide we don't @@ -343,7 +343,7 @@ func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child * return err } if ser.Written() != 0 { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } if child.Inode.overlay.upper != nil { @@ -374,7 +374,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // Maybe some day we can allow the more complicated case of // non-overlay X overlay renames, but that's not necessary right now. if renamed.Inode.overlay == nil || newParent.Inode.overlay == nil || oldParent.Inode.overlay == nil { - return syserror.EXDEV + return linuxerr.EXDEV } if replacement { @@ -396,7 +396,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // newName has been removed out from under us. That's fine; // filesystems where that can happen must handle stale // 'replaced'. - if err != nil && err != syserror.ENOENT { + if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } if err == nil { @@ -420,7 +420,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // need to bother checking for them. if len(children) > 0 { replaced.DecRef(ctx) - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } @@ -552,7 +552,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin // Don't forward the value of the extended attribute if it would // unexpectedly change the behavior of a wrapping overlay layer. if isXattrOverlay(name) { - return "", syserror.ENODATA + return "", linuxerr.ENODATA } o.copyMu.RLock() @@ -568,7 +568,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error { // Don't allow changes to overlay xattrs through a setxattr syscall. if isXattrOverlay(name) { - return syserror.EPERM + return linuxerr.EPERM } if err := copyUp(ctx, d); err != nil { @@ -600,7 +600,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error { // Don't allow changes to overlay xattrs through a removexattr syscall. if isXattrOverlay(name) { - return syserror.EPERM + return linuxerr.EPERM } if err := copyUp(ctx, d); err != nil { @@ -687,7 +687,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { dirent.DecRef(ctx) // Claim that the path is not accessible. - err = syserror.EACCES + err = linuxerr.EACCES log.Warningf("Getlink not supported in overlay for %q", name) } return nil, err diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index aa9851b26..a3800d700 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -18,11 +18,11 @@ import ( "testing" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" - "gvisor.dev/gvisor/pkg/syserror" ) func TestLookup(t *testing.T) { @@ -191,11 +191,11 @@ func TestLookup(t *testing.T) { } { t.Run(test.desc, func(t *testing.T) { dirent, err := test.dir.Lookup(ctx, test.name) - if test.found && (err == syserror.ENOENT || dirent.IsNegative()) { + if test.found && (linuxerr.Equals(linuxerr.ENOENT, err) || dirent.IsNegative()) { t.Fatalf("lookup %q expected to find positive dirent, got dirent %v err %v", test.name, dirent, err) } if !test.found { - if err != syserror.ENOENT && !dirent.IsNegative() { + if !linuxerr.Equals(linuxerr.ENOENT, err) && !dirent.IsNegative() { t.Errorf("lookup %q expected to return ENOENT or negative dirent, got dirent %v err %v", test.name, dirent, err) } // Nothing more to check. @@ -389,7 +389,7 @@ func (d *dir) GetXattr(_ context.Context, _ *fs.Inode, name string, _ uint64) (s return "y", nil } } - return "", syserror.ENOATTR + return "", linuxerr.ENOATTR } // GetFile implements InodeOperations.GetFile. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 1b83643db..51cd6cd37 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -20,12 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -116,23 +116,23 @@ func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { // Seek implements FileOperations.Seek. func (*Inotify) Seek(context.Context, *File, SeekWhence, int64) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Readdir implements FileOperatons.Readdir. func (*Inotify) Readdir(context.Context, *File, DentrySerializer) (int64, error) { - return 0, syserror.ENOTDIR + return 0, linuxerr.ENOTDIR } // Write implements FileOperations.Write. func (*Inotify) Write(context.Context, *File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Read implements FileOperations.Read. func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ int64) (int64, error) { if dst.NumBytes() < inotifyEventBaseSize { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } i.evMu.Lock() @@ -140,7 +140,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i if i.events.Empty() { // Nothing to read yet, tell caller to block. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } var writeLen int64 @@ -156,7 +156,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i // write some events out. return writeLen, nil } - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Linux always dequeues an available event as long as there's enough @@ -178,17 +178,17 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i // WriteTo implements FileOperations.WriteTo. func (*Inotify) WriteTo(context.Context, *File, io.Writer, int64, bool) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Fsync implements FileOperations.Fsync. func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error { - return syserror.EINVAL + return linuxerr.EINVAL } // ReadFrom implements FileOperations.ReadFrom. func (*Inotify) ReadFrom(context.Context, *File, io.Reader, int64) (int64, error) { - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Flush implements FileOperations.Flush. @@ -198,7 +198,7 @@ func (*Inotify) Flush(context.Context, *File) error { // ConfigureMMap implements FileOperations.ConfigureMMap. func (*Inotify) ConfigureMMap(context.Context, *File, *memmap.MMapOpts) error { - return syserror.ENODEV + return linuxerr.ENODEV } // UnstableAttr implements FileOperations.UnstableAttr. @@ -222,7 +222,7 @@ func (i *Inotify) Ioctl(ctx context.Context, _ *File, io usermem.IO, args arch.S return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } @@ -329,7 +329,7 @@ func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { watch, ok := i.watches[wd] if !ok { i.mu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } // Remove the watch from this instance. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 1d6ea5736..fba7b961b 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -16,7 +16,7 @@ package fs import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // MockInodeOperations implements InodeOperations for testing Inodes. @@ -109,7 +109,7 @@ func (n *MockInodeOperations) SetPermissions(context.Context, *Inode, FilePermis // SetOwner implements fs.InodeOperations.SetOwner. func (*MockInodeOperations) SetOwner(context.Context, *Inode, FileOwner) error { - return syserror.EINVAL + return linuxerr.EINVAL } // SetTimestamps implements fs.InodeOperations.SetTimestamps. @@ -172,5 +172,5 @@ func (n *MockInodeOperations) RemoveDirectory(context.Context, *Inode, string) e // Getlink implements fs.InodeOperations.Getlink. func (n *MockInodeOperations) Getlink(context.Context, *Inode) (*Dirent, error) { - return nil, syserror.ENOLINK + return nil, linuxerr.ENOLINK } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 243098a09..10146af4e 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -20,10 +20,10 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // DefaultTraversalLimit provides a sensible default traversal limit that may @@ -281,7 +281,7 @@ func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error // Linux allows mounting over the root (?). It comes with a strange set // of semantics. We'll just not do this for now. if node.parent == nil { - return syserror.EBUSY + return linuxerr.EBUSY } // For both mount and unmount, we take this lock so we can swap out the @@ -357,7 +357,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly orig, ok := mns.mounts[node] if !ok { // node is not a mount point. - return syserror.EINVAL + return linuxerr.EINVAL } if orig.previous == nil { @@ -380,7 +380,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly if refs := m.DirentRefs(); refs < 2 { panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs)) } else if refs != 2 { - return syserror.EBUSY + return linuxerr.EBUSY } } @@ -497,7 +497,7 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path if current != root { if !IsDir(current.Inode.StableAttr) { current.DecRef(ctx) // Drop reference from above. - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { current.DecRef(ctx) // Drop reference from above. @@ -566,8 +566,8 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema // Resolve the path. target, err := node.Inode.Getlink(ctx) - switch err { - case nil: + switch { + case err == nil: // Make sure we didn't exhaust the traversal budget. if *remainingTraversals == 0 { target.DecRef(ctx) @@ -577,11 +577,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema node.DecRef(ctx) // Drop the original reference. return target, nil - case unix.ENOLINK: + case linuxerr.Equals(linuxerr.ENOLINK, err): // Not a symlink. return node, nil - case ErrResolveViaReadlink: + case err == ErrResolveViaReadlink: defer node.DecRef(ctx) // See above. // First, check if we should traverse. diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index f96f5a3e5..7e72e47b5 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -19,11 +19,11 @@ import ( "strings" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // The virtual filesystem implements an overlay configuration. For a high-level @@ -218,7 +218,7 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist // We don't support copying up from character devices, // named pipes, or anything weird (like proc files). log.Warningf("%s not supported in lower filesytem", lower.StableAttr.Type) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } return &overlayEntry{ diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 7af7e0b45..bc75ae505 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -30,6 +30,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/sentry/fs", @@ -49,7 +50,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/usermem", diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 24426b225..75dc5d204 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -104,10 +104,10 @@ var _ fs.FileOperations = (*execArgFile)(nil) // Read reads the exec arg from the process's address space.. func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } - m, err := getTaskMM(f.t) + m, err := getTaskMMIncRef(f.t) if err != nil { return 0, err } diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index e90da225a..e68bb46c0 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -20,12 +20,12 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -37,7 +37,7 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF n, err := strconv.ParseUint(p, 10, 64) if err != nil { // Not found. - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } var file *fs.File @@ -48,7 +48,7 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF } }) if file == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return toInode(file, fdFlags), nil } diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 91c35eea9..187e9a921 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -34,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" ) @@ -291,7 +291,7 @@ func (n *netSnmp) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s continue } if err := n.s.Statistics(stat, line.prefix); err != nil { - if err == syserror.EOPNOTSUPP { + if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) } else { log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 2f2a9f920..b9629c598 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package proc implements a partial in-memory file system for profs. +// Package proc implements a partial in-memory file system for procfs. package proc import ( @@ -21,13 +21,13 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -124,13 +124,13 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if t := kernel.TaskFromContext(ctx); t != nil { tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // threadSelf is more magical than "self" link. @@ -148,13 +148,13 @@ func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, err tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // Lookup loads an Inode at name into a Dirent. diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 713b81e08..90bd32345 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -9,13 +9,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", "//pkg/sentry/kernel/time", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index b01688b1d..77270814e 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -20,13 +20,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -204,7 +204,7 @@ var _ fs.FileOperations = (*seqFileOperations)(nil) // Write implements fs.FileOperations.Write. func (*seqFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EACCES + return 0, linuxerr.EACCES } // Read implements fs.FileOperations.Read. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 4893af56b..71f37d582 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -592,7 +592,7 @@ func (pf *portRangeFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSe // Port numbers must be uint16s. if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if err := pf.inode.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index ae5ed25f9..03f2a882d 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -34,19 +35,32 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // LINT.IfChange -// getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's -// users count is incremented, and must be decremented by the caller when it is -// no longer in use. -func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { +// getTaskMM gets the kernel task's MemoryManager. No additional reference is +// taken on mm here. This is safe because MemoryManager.destroy is required to +// leave the MemoryManager in a state where it's still usable as a +// DynamicBytesSource. +func getTaskMM(t *kernel.Task) *mm.MemoryManager { + var tmm *mm.MemoryManager + t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + tmm = mm + } + }) + return tmm +} + +// getTaskMMIncRef returns t's MemoryManager. If getTaskMMIncRef succeeds, the +// MemoryManager's users count is incremented, and must be decremented by the +// caller when it is no longer in use. +func getTaskMMIncRef(t *kernel.Task) (*mm.MemoryManager, error) { if t.ExitState() == kernel.TaskExitDead { - return nil, syserror.ESRCH + return nil, linuxerr.ESRCH } var m *mm.MemoryManager t.WithMuLocked(func(t *kernel.Task) { @@ -61,9 +75,9 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { func checkTaskState(t *kernel.Task) error { switch t.ExitState() { case kernel.TaskExitZombie: - return syserror.EACCES + return linuxerr.EACCES case kernel.TaskExitDead: - return syserror.ESRCH + return linuxerr.ESRCH } return nil } @@ -181,7 +195,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry tasks := f.t.ThreadGroup().MemberIDs(f.pidns) if len(tasks) == 0 { - return offset, syserror.ENOENT + return offset, linuxerr.ENOENT } if offset == 0 { @@ -233,15 +247,15 @@ var _ fs.FileOperations = (*subtasksFile)(nil) func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { tid, err := strconv.ParseUint(p, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } task := s.p.pidns.TaskWithID(kernel.ThreadID(tid)) if task == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if task.ThreadGroup() != s.t.ThreadGroup() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } td := s.p.newTaskDir(ctx, task, dir.MountSource, false) @@ -269,28 +283,25 @@ func (e *exe) executable() (file fsbridge.File, err error) { if err := checkTaskState(e.t); err != nil { return nil, err } - e.t.WithMuLocked(func(t *kernel.Task) { - mm := t.MemoryManager() - if mm == nil { - err = syserror.EACCES - return - } + mm := getTaskMM(e.t) + if mm == nil { + return nil, linuxerr.EACCES + } - // The MemoryManager may be destroyed, in which case - // MemoryManager.destroy will simply set the executable to nil - // (with locks held). - file = mm.Executable() - if file == nil { - err = syserror.ESRCH - } - }) + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + file = mm.Executable() + if file == nil { + err = linuxerr.ESRCH + } return } // Readlink implements fs.InodeOperations. func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if !kernel.ContextCanTrace(ctx, e.t, false) { - return "", syserror.EACCES + return "", linuxerr.EACCES } // Pull out the executable for /proc/TID/exe. @@ -323,7 +334,7 @@ func newCwd(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inode // Readlink implements fs.InodeOperations. func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if !kernel.ContextCanTrace(ctx, e.t, false) { - return "", syserror.EACCES + return "", linuxerr.EACCES } if err := checkTaskState(e.t); err != nil { return "", err @@ -331,14 +342,14 @@ func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { cwd := e.t.FSContext().WorkingDirectory() if cwd == nil { // It could have raced with process deletion. - return "", syserror.ESRCH + return "", linuxerr.ESRCH } defer cwd.DecRef(ctx) root := fs.RootFromContext(ctx) if root == nil { // It could have raced with process deletion. - return "", syserror.ESRCH + return "", linuxerr.ESRCH } defer root.DecRef(ctx) @@ -380,7 +391,7 @@ func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (strin // Getlink implements fs.InodeOperations.Getlink. func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { if !kernel.ContextCanTrace(ctx, n.t, false) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } if err := checkTaskState(n.t); err != nil { return nil, err @@ -448,7 +459,7 @@ func (m *memData) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileF // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH if !kernel.ContextCanTrace(ctx, m.t, true) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } if err := checkTaskState(m.t); err != nil { return nil, err @@ -463,7 +474,7 @@ func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen if dst.NumBytes() == 0 { return 0, nil } - mm, err := getTaskMM(m.t) + mm, err := getTaskMMIncRef(m.t) if err != nil { return 0, nil } @@ -473,12 +484,12 @@ func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen n, readErr := mm.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) if n > 0 { if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return int64(n), nil } if readErr != nil { - return 0, syserror.EIO + return 0, linuxerr.EIO } return 0, nil } @@ -494,22 +505,9 @@ func newMaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Inod return newProcInode(ctx, seqfile.NewSeqFile(ctx, &mapsData{t}), msrc, fs.SpecialFile, t) } -func (md *mapsData) mm() *mm.MemoryManager { - var tmm *mm.MemoryManager - md.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - // No additional reference is taken on mm here. This is safe - // because MemoryManager.destroy is required to leave the - // MemoryManager in a state where it's still usable as a SeqSource. - tmm = mm - } - }) - return tmm -} - // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. func (md *mapsData) NeedsUpdate(generation int64) bool { - if mm := md.mm(); mm != nil { + if mm := getTaskMM(md.t); mm != nil { return mm.NeedsUpdate(generation) } return true @@ -517,7 +515,7 @@ func (md *mapsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { - if mm := md.mm(); mm != nil { + if mm := getTaskMM(md.t); mm != nil { return mm.ReadMapsSeqFileData(ctx, h) } return []seqfile.SeqData{}, 0 @@ -534,22 +532,9 @@ func newSmaps(ctx context.Context, t *kernel.Task, msrc *fs.MountSource) *fs.Ino return newProcInode(ctx, seqfile.NewSeqFile(ctx, &smapsData{t}), msrc, fs.SpecialFile, t) } -func (sd *smapsData) mm() *mm.MemoryManager { - var tmm *mm.MemoryManager - sd.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - // No additional reference is taken on mm here. This is safe - // because MemoryManager.destroy is required to leave the - // MemoryManager in a state where it's still usable as a SeqSource. - tmm = mm - } - }) - return tmm -} - // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. func (sd *smapsData) NeedsUpdate(generation int64) bool { - if mm := sd.mm(); mm != nil { + if mm := getTaskMM(sd.t); mm != nil { return mm.NeedsUpdate(generation) } return true @@ -557,7 +542,7 @@ func (sd *smapsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. func (sd *smapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { - if mm := sd.mm(); mm != nil { + if mm := getTaskMM(sd.t); mm != nil { return mm.ReadSmapsSeqFileData(ctx, h) } return []seqfile.SeqData{}, 0 @@ -627,12 +612,10 @@ func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) + if mm := getTaskMM(s.t); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } fmt.Fprintf(&buf, "%d %d ", vss, rss/hostarch.PageSize) // rsslim. @@ -677,12 +660,10 @@ func (s *statmData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ } var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) + if mm := getTaskMM(s.t); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } var buf bytes.Buffer fmt.Fprintf(&buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) @@ -734,12 +715,13 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.CurrentMaxFDs() } - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - data = mm.VirtualDataSize() - } }) + + if mm := getTaskMM(s.t); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } fmt.Fprintf(&buf, "FDSize:\t%d\n", fds) fmt.Fprintf(&buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(&buf, "VmRSS:\t%d kB\n", rss>>10) @@ -867,7 +849,7 @@ var _ fs.FileOperations = (*commFile)(nil) // Read implements fs.FileOperations.Read. func (f *commFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } buf := []byte(f.t.Name() + "\n") @@ -922,10 +904,10 @@ type auxvecFile struct { // Read implements fs.FileOperations.Read. func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } - m, err := getTaskMM(f.t) + m, err := getTaskMMIncRef(f.t) if err != nil { return 0, err } @@ -1003,7 +985,7 @@ func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // Read implements fs.FileOperations.Read. func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if f.t.ExitState() == kernel.TaskExitDead { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } var buf bytes.Buffer fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj()) @@ -1030,7 +1012,7 @@ func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOS } if f.t.ExitState() == kernel.TaskExitDead { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } if err := f.t.SetOOMScoreAdj(v); err != nil { return 0, err diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 30d5ad4cf..fcdc1e7bd 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -21,12 +21,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -108,7 +108,7 @@ const maxIDMapLines = 5 // Read implements fs.FileOperations.Read. func (imfo *idMapFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var entries []auth.IDMapEntry if imfo.iops.gids { @@ -134,7 +134,7 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u // the file ..." - user_namespaces(7) srclen := src.NumBytes() if srclen >= hostarch.PageSize || offset != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } b := make([]byte, srclen) if _, err := src.CopyIn(ctx, b); err != nil { @@ -154,7 +154,7 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u } lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) if len(lines) > maxIDMapLines { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries := make([]auth.IDMapEntry, len(lines)) @@ -162,7 +162,7 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u var e auth.IDMapEntry _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) if err != nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries[i] = e } diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index c0f6fb802..ac896f963 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -74,7 +74,7 @@ type uptimeFile struct { // Read implements fs.FileOperations.Read. func (f *uptimeFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } now := ktime.NowFromContext(ctx) diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 4a3d9636b..bfff010c5 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -14,13 +14,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/socket/unix/transport", "//pkg/sync", - "//pkg/syserror", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 19990f9db..b1fadee7a 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -21,11 +21,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // CreateOps represents operations to create different file types. @@ -178,7 +178,7 @@ func (d *Dir) Children() ([]string, map[string]fs.DentAttr) { func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) { inode, ok := d.children[name] if !ok { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } delete(d.children, name) @@ -208,7 +208,7 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er // Remove removes the named non-directory. func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -226,7 +226,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { // RemoveDirectory removes the named directory. func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) error { if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -240,7 +240,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err if ok, err := hasChildren(ctx, childInode); err != nil { return err } else if ok { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } // Child was empty. Proceed with removal. @@ -259,7 +259,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err // with a reference. func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, error) { if len(p) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -283,16 +283,16 @@ func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { return inode, nil } - // fs.InodeOperations.Lookup returns syserror.ENOENT if p + // fs.InodeOperations.Lookup returns linuxerr.ENOENT if p // does not exist. - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // createInodeOperationsCommon creates a new child node at this dir by calling // makeInodeOperations. It is the common logic for creating a new child. func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, makeInodeOperations func() (*fs.Inode, error)) (*fs.Inode, error) { if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -311,7 +311,7 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make // Create creates a new Inode with the given name and returns its File. func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { if d.CreateOps == nil || d.CreateOps.NewFile == nil { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { @@ -333,7 +333,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // CreateLink returns a new link. func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { if d.CreateOps == nil || d.CreateOps.NewSymlink == nil { - return syserror.EACCES + return linuxerr.EACCES } _, err := d.createInodeOperationsCommon(ctx, newname, func() (*fs.Inode, error) { return d.NewSymlink(ctx, dir, oldname) @@ -344,7 +344,7 @@ func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname st // CreateHardLink creates a new hard link. func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } d.mu.Lock() @@ -362,7 +362,7 @@ func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inod // CreateDirectory returns a new subdirectory. func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewDir == nil { - return syserror.EACCES + return linuxerr.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewDir(ctx, dir, perms) @@ -373,7 +373,7 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p // Bind implements fs.InodeOperations.Bind. func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewBoundEndpoint(ctx, dir, ep, perms) @@ -392,7 +392,7 @@ func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport // CreateFifo implements fs.InodeOperations.CreateFifo. func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewFifo == nil { - return syserror.EACCES + return linuxerr.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewFifo(ctx, dir, perms) @@ -496,14 +496,14 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string, replacement bool) error { op, ok := oldParent.(*Dir) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } np, ok := newParent.(*Dir) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } if len(newName) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } np.mu.Lock() @@ -521,7 +521,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n if ok, err := hasChildren(ctx, replaced); err != nil { return err } else if ok { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index d0c565879..dc9d27bb3 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -17,10 +17,10 @@ package ramfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -64,7 +64,7 @@ func (s *Socket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { // GetFile implements fs.FileOperations.GetFile. func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } // +stateify savable diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index 33da82868..266140f6f 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -19,7 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // Splice moves data to this file, directly from another. @@ -28,7 +28,7 @@ import ( func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, error) { // Verify basic file flag permissions. if !dst.Flags().Write || !src.Flags().Read { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Check whether or not the objects being sliced are stream-oriented @@ -54,26 +54,26 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, case dst.UniqueID < src.UniqueID: // Acquire dst first. if !dst.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } if !src.mu.Lock(ctx) { dst.mu.Unlock() - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } case dst.UniqueID > src.UniqueID: // Acquire src first. if !src.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } if !dst.mu.Lock(ctx) { src.mu.Unlock() - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } case dst.UniqueID == src.UniqueID: // Acquire only one lock; it's the same file. This is a // bit of a edge case, but presumably it's possible. if !dst.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } srcLock = false // Only need one unlock. } @@ -83,13 +83,13 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, case dstLock: // Acquire only dst. if !dst.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } opts.DstStart = dst.offset // Safe: locked. case srcLock: // Acquire only src. if !src.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + return 0, linuxerr.ErrInterrupted } opts.SrcStart = src.offset // Safe: locked. } @@ -107,7 +107,7 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, limit, ok := dst.checkLimit(ctx, opts.DstStart) switch { case ok && limit == 0: - err = syserror.ErrExceedsFileSizeLimit + err = linuxerr.ErrExceedsFileSizeLimit case ok && limit < opts.Length: opts.Length = limit // Cap the write. } @@ -139,7 +139,7 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, // Attempt to do a WriteTo; this is likely the most efficient. n, err := src.FileOperations.WriteTo(ctx, src, w, opts.Length, opts.Dup) - if n == 0 && err == syserror.ENOSYS && !opts.Dup { + if n == 0 && linuxerr.Equals(linuxerr.ENOSYS, err) && !opts.Dup { // Attempt as a ReadFrom. If a WriteTo, a ReadFrom may also be // more efficient than a copy if buffers are cached or readily // available. (It's unlikely that they can actually be donated). @@ -151,7 +151,7 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, // if we block at some point, we could lose data. If the source is // not a pipe then reading is not destructive; if the destination // is a regular file, then it is guaranteed not to block writing. - if n == 0 && err == syserror.ENOSYS && !opts.Dup && (!dstPipe || !srcPipe) { + if n == 0 && linuxerr.Equals(linuxerr.ENOSYS, err) && !opts.Dup && (!dstPipe || !srcPipe) { // Fallback to an in-kernel copy. n, err = io.Copy(w, &io.LimitedReader{ R: r, diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index c7977a217..e61115932 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -8,12 +8,12 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index c8ebe256c..1c8518d71 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -20,12 +20,12 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -121,7 +121,7 @@ func (t *TimerOperations) EventUnregister(e *waiter.Entry) { func (t *TimerOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if val := atomic.SwapUint64(&t.val, 0); val != 0 { var buf [sizeofUint64]byte @@ -133,12 +133,12 @@ func (t *TimerOperations) Read(ctx context.Context, file *fs.File, dst usermem.I } return sizeofUint64, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Write implements fs.FileOperations.Write. func (t *TimerOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Notify implements ktime.TimerListener.Notify. diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 90398376a..511fffb43 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/safemem", "//pkg/sentry/device", @@ -30,7 +31,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 7faa822f0..1974523bf 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -31,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -150,7 +150,7 @@ func (*fileInodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldPare // GetFile implements fs.InodeOperations.GetFile. func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { if fs.IsSocket(d.Inode.StableAttr) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } if flags.Write { @@ -217,7 +217,7 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in fallthrough case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed f.dataMu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } if oldSize != size { @@ -278,7 +278,7 @@ func (f *fileInodeOperations) Allocate(ctx context.Context, _ *fs.Inode, offset, // Check if current seals allow growth. if f.seals&linux.F_SEAL_GROW != 0 { - return syserror.EPERM + return linuxerr.EPERM } f.attr.Size = newSize @@ -455,13 +455,13 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) if end == math.MaxInt64 { // Overflow. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check if seals prevent either file growth or all writes. switch { case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed - return 0, syserror.EPERM + return 0, linuxerr.EPERM case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would // normally grow the file to partially succeed up to the current EOF, @@ -482,7 +482,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } if end <= rw.offset { // Truncation would result in no data being written. - return 0, syserror.EPERM + return 0, linuxerr.EPERM } } @@ -550,7 +550,7 @@ func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingS // Reject writable mapping if F_SEAL_WRITE is set. if f.seals&linux.F_SEAL_WRITE != 0 && writable { - return syserror.EPERM + return linuxerr.EPERM } f.mappings.AddMapping(ms, ar, offset, writable) @@ -655,7 +655,7 @@ func GetSeals(inode *fs.Inode) (uint32, error) { return f.seals, nil } // Not a memfd inode. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // AddSeals adds new file seals to a memfd inode. @@ -668,13 +668,13 @@ func AddSeals(inode *fs.Inode, val uint32) error { if f.seals&linux.F_SEAL_SEAL != 0 { // Seal applied which prevents addition of any new seals. - return syserror.EPERM + return linuxerr.EPERM } // F_SEAL_WRITE can only be added if there are no active writable maps. if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { if f.writableMappingPages > 0 { - return syserror.EBUSY + return linuxerr.EBUSY } } @@ -683,5 +683,5 @@ func AddSeals(inode *fs.Inode, val uint32) error { return nil } // Not a memfd inode. - return syserror.EINVAL + return linuxerr.EINVAL } diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 6aa8ff331..9a835b556 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) var fsInfo = fs.Info{ @@ -49,7 +49,7 @@ var fsInfo = fs.Info{ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { // Don't allow renames across different mounts. if newParent.MountSource != oldParent.MountSource { - return syserror.EXDEV + return linuxerr.EXDEV } op := oldParent.InodeOperations.(*Dir) diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 86ada820e..9e9dc06f3 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -17,6 +17,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal/primitive", "//pkg/refs", @@ -30,7 +31,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 13c9dbe7d..5716e2ee9 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -22,13 +22,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -154,12 +154,12 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str n, err := strconv.ParseUint(name, 10, 32) if err != nil { // Not found. - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } s, ok := d.replicas[uint32(n)] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } s.IncRef() @@ -170,54 +170,54 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str // // Creation is never allowed. func (d *dirInodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } // CreateDirectory implements fs.InodeOperations.CreateDirectory. // // Creation is never allowed. func (d *dirInodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - return syserror.EACCES + return linuxerr.EACCES } // CreateLink implements fs.InodeOperations.CreateLink. // // Creation is never allowed. func (d *dirInodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { - return syserror.EACCES + return linuxerr.EACCES } // CreateHardLink implements fs.InodeOperations.CreateHardLink. // // Creation is never allowed. func (d *dirInodeOperations) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { - return syserror.EACCES + return linuxerr.EACCES } // CreateFifo implements fs.InodeOperations.CreateFifo. // // Creation is never allowed. func (d *dirInodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - return syserror.EACCES + return linuxerr.EACCES } // Remove implements fs.InodeOperations.Remove. // // Removal is never allowed. func (d *dirInodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // RemoveDirectory implements fs.InodeOperations.RemoveDirectory. // // Removal is never allowed. func (d *dirInodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { - return syserror.EPERM + return linuxerr.EPERM } // Bind implements fs.InodeOperations.Bind. func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // GetFile implements fs.InodeOperations.GetFile. @@ -234,7 +234,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e n := d.next if n == math.MaxUint32 { - return nil, syserror.ENOMEM + return nil, linuxerr.ENOMEM } if _, ok := d.replicas[n]; ok { @@ -334,10 +334,10 @@ func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, seriali // Read implements FileOperations.Read func (df *dirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements FileOperations.Write. func (df *dirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 13f4901db..0e5916380 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -16,9 +16,9 @@ package tty import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) // ptsDevice is the pseudo-filesystem device. @@ -64,7 +64,7 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou // No options are supported. if data != "" { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } return newDir(ctx, fs.NewMountSource(ctx, &superOperations{}, f, flags)), nil diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 3ba02c218..f9fca6d8e 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -193,7 +193,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque } return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -207,7 +207,7 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error { @@ -228,7 +228,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ } return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -242,7 +242,7 @@ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSeq l.masterWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // transformer is a helper interface to make it easier to stateify queue. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 1cf869b62..88d6703a8 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -17,13 +17,13 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -157,7 +157,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, file *fs.File, io use t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -201,7 +201,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, file *fs.File, io use return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 11d6c15d0..25d3c887e 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -17,12 +17,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -110,7 +110,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl defer q.mu.Unlock() if !q.readable { - return 0, false, syserror.ErrWouldBlock + return 0, false, linuxerr.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { @@ -155,7 +155,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip room := waitBufMaxBytes - q.waitBufLen // If out of room, return EAGAIN. if room == 0 && copyLen > 0 { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Cap the size of the wait buffer. if copyLen > room { diff --git a/pkg/sentry/fs/tty/replica.go b/pkg/sentry/fs/tty/replica.go index 0e3eea3bd..ca5bc7535 100644 --- a/pkg/sentry/fs/tty/replica.go +++ b/pkg/sentry/fs/tty/replica.go @@ -17,12 +17,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -142,7 +142,7 @@ func (sf *replicaFileOperations) Ioctl(ctx context.Context, file *fs.File, io us t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -179,7 +179,7 @@ func (sf *replicaFileOperations) Ioctl(ctx context.Context, file *fs.File, io us return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD index 66e949c95..23b5508fd 100644 --- a/pkg/sentry/fs/user/BUILD +++ b/pkg/sentry/fs/user/BUILD @@ -12,13 +12,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go index 124bc95ed..67a9adfd7 100644 --- a/pkg/sentry/fs/user/path.go +++ b/pkg/sentry/fs/user/path.go @@ -21,13 +21,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // ResolveExecutablePath resolves the given executable name given the working @@ -80,7 +80,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s root := fs.RootFromContext(ctx) if root == nil { // Caller has no root. Don't bother traversing anything. - return "", syserror.ENOENT + return "", linuxerr.ENOENT } defer root.DecRef(ctx) for _, p := range paths { @@ -93,7 +93,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s binPath := path.Join(p, name) traversals := uint(linux.MaxSymlinkTraversals) d, err := mns.FindInode(ctx, root, nil, binPath, &traversals) - if err == syserror.ENOENT || err == syserror.EACCES { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.EACCES, err) { // Didn't find it here. continue } @@ -116,7 +116,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s } // Couldn't find it. - return "", syserror.ENOENT + return "", linuxerr.ENOENT } func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { @@ -142,7 +142,7 @@ func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNam Flags: linux.O_RDONLY, } dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) - if err == syserror.ENOENT || err == syserror.EACCES { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.EACCES, err) { // Didn't find it here. continue } @@ -155,7 +155,7 @@ func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNam } // Couldn't find it. - return "", syserror.ENOENT + return "", linuxerr.ENOENT } // getPath returns the PATH as a slice of strings given the environment diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD index 6c798f0bd..4631db2bb 100644 --- a/pkg/sentry/fsbridge/BUILD +++ b/pkg/sentry/fsbridge/BUILD @@ -13,12 +13,12 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go index 9785fd62a..527bde181 100644 --- a/pkg/sentry/fsbridge/fs.go +++ b/pkg/sentry/fsbridge/fs.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -127,7 +127,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio defer d.DecRef(ctx) if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP } fsPerm := openOptionsToPermMask(&opts) @@ -138,13 +138,13 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio // If they claim it's a directory, then make sure. if strings.HasSuffix(path, "/") { if d.Inode.StableAttr.Type != fs.Directory { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } } if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile { ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags)) diff --git a/pkg/sentry/fsimpl/cgroupfs/BUILD b/pkg/sentry/fsimpl/cgroupfs/BUILD index 37efb641a..e5fdcc776 100644 --- a/pkg/sentry/fsimpl/cgroupfs/BUILD +++ b/pkg/sentry/fsimpl/cgroupfs/BUILD @@ -31,6 +31,8 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/coverage", + "//pkg/errors/linuxerr", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", @@ -42,7 +44,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go index fe9871bdd..71bb0a9c8 100644 --- a/pkg/sentry/fsimpl/cgroupfs/base.go +++ b/pkg/sentry/fsimpl/cgroupfs/base.go @@ -23,10 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -88,7 +88,6 @@ type controller interface { // +stateify savable type cgroupInode struct { dir - fs *filesystem // ts is the list of tasks in this cgroup. The kernel is responsible for // removing tasks from this list before they're destroyed, so any tasks on @@ -102,9 +101,10 @@ var _ kernel.CgroupImpl = (*cgroupInode)(nil) func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { c := &cgroupInode{ - fs: fs, - ts: make(map[*kernel.Task]struct{}), + dir: dir{fs: fs}, + ts: make(map[*kernel.Task]struct{}), } + c.dir.cgi = c contents := make(map[string]kernfs.Inode) contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c}) @@ -115,8 +115,7 @@ func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credential } c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) - c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - c.dir.InitRefs() + c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents)) atomic.AddUint64(&fs.numCgroups, 1) @@ -253,7 +252,7 @@ func parseInt64FromString(ctx context.Context, src usermem.IOSequence, offset in // Note: This also handles zero-len writes if offset is beyond the end // of src, or src is empty. ctx.Warningf("cgroupfs.parseInt64FromString: failed to parse %q: %v", string(buf), err) - return 0, int64(n), syserror.EINVAL + return 0, int64(n), linuxerr.EINVAL } return val, int64(n), nil diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index 05d7eb4ce..edc3b50b9 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -32,7 +32,8 @@ // controllers associated with them. // // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between -// cgroupfs dentries and inodes. +// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref +// counted and exist until they're unlinked once or the FS is destroyed. // // # Synchronization // @@ -48,10 +49,11 @@ // Lock order: // // kernel.CgroupRegistry.mu -// cgroupfs.filesystem.mu -// kernel.TaskSet.mu -// kernel.Task.mu -// cgroupfs.filesystem.tasksMu. +// kernfs.filesystem.mu +// kernel.TaskSet.mu +// kernel.Task.mu +// cgroupfs.filesystem.tasksMu. +// cgroupfs.dir.OrderedChildren.mu package cgroupfs import ( @@ -62,12 +64,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -108,6 +111,7 @@ type FilesystemType struct{} // +stateify savable type InternalData struct { DefaultControlValues map[string]int64 + InitialCgroupPath string } // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. @@ -134,6 +138,11 @@ type filesystem struct { numCgroups uint64 // Protected by atomic ops. root *kernfs.Dentry + // effectiveRoot is the initial cgroup new tasks are created in. Unless + // overwritten by internal mount options, root == effectiveRoot. If + // effectiveRoot != root, an extra reference is held on effectiveRoot for + // the lifetime of the filesystem. + effectiveRoot *kernfs.Dentry // tasksMu serializes task membership changes across all cgroups within a // filesystem. @@ -167,7 +176,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } @@ -195,7 +204,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if _, ok := mopts["all"]; ok { if len(wantControllers) > 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "all") @@ -209,7 +218,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if len(mopts) != 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } k := kernel.KernelFromContext(ctx) @@ -229,6 +238,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs := vfsfs.Impl().(*filesystem) ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) fs.root.IncRef() + if fs.effectiveRoot != fs.root { + fs.effectiveRoot.IncRef() + } return vfsfs, fs.root.VFSDentry(), nil } @@ -245,8 +257,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var defaults map[string]int64 if opts.InternalData != nil { - ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) defaults = opts.InternalData.(*InternalData).DefaultControlValues + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) } for _, ty := range wantControllers { @@ -286,6 +298,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) fs.root = &rootD + fs.effectiveRoot = fs.root + + if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) + rootD.DecRef(ctx) + fs.VFSFilesystem().DecRef(ctx) + return nil, nil, err + } // Register controllers. The registry may be modified concurrently, so if we // get an error, we raced with someone else who registered the same @@ -294,7 +314,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err) rootD.DecRef(ctx) fs.VFSFilesystem().DecRef(ctx) - return nil, nil, syserror.EBUSY + return nil, nil, linuxerr.EBUSY } // Move all existing tasks to the root of the new hierarchy. @@ -303,10 +323,47 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), rootD.VFSDentry(), nil } +// prepareInitialCgroup creates the initial cgroup according to opts. An initial +// cgroup is optional, and if not specified, this function is a no-op. +func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { + if opts.InternalData == nil { + return nil + } + initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath + if initPathStr == "" { + return nil + } + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) + initPath := fspath.Parse(initPathStr) + if !initPath.Absolute || !initPath.HasComponents() { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) + return linuxerr.EINVAL + } + + // Have initial cgroup target, create the tree. + cgDir := fs.root.Inode().(*cgroupInode) + for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { + cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{}) + if err != nil { + return err + } + cgDir = cgDirI.(*cgroupInode) + } + + // Walk to target dentry. + initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) + if err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) + return linuxerr.ENOENT + } + fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. + return nil +} + func (fs *filesystem) rootCgroup() kernel.Cgroup { return kernel.Cgroup{ - Dentry: fs.root, - CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), + Dentry: fs.effectiveRoot, + CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), } } @@ -320,6 +377,10 @@ func (fs *filesystem) Release(ctx context.Context) { r.Unregister(fs.hierarchyID) } + if fs.root != fs.effectiveRoot { + fs.effectiveRoot.DecRef(ctx) + } + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } @@ -346,15 +407,18 @@ func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error // // +stateify savable type dir struct { - dirRefs + kernfs.InodeNoopRefCount kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir. + kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren implStatFS locks vfs.FileLocks + + fs *filesystem // Immutable. + cgi *cgroupInode // Immutable. } // Keep implements kernfs.Inode.Keep. @@ -364,7 +428,7 @@ func (*dir) Keep() bool { // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // Open implements kernfs.Inode.Open. @@ -378,14 +442,100 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry return fd.VFSFileDescription(), nil } -// DecRef implements kernfs.Inode.DecRef. -func (d *dir) DecRef(ctx context.Context) { - d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +// NewDir implements kernfs.Inode.NewDir. +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { + // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable." + // -- Linux, kernel/cgroup.c:cgroup_mkdir(). + if strings.Contains(name, "\n") { + return nil, linuxerr.EINVAL + } + return d.OrderedChildren.Inserter(name, func() kernfs.Inode { + d.IncLinks(1) + return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx)) + }) } -// StatFS implements kernfs.Inode.StatFS. -func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { - return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil +// Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of +// cgroup directories, and the rename may only change the name within the same +// parent. See linux, kernel/cgroup.c:cgroup_rename(). +func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { + if _, ok := child.(*cgroupInode); !ok { + // Not a cgroup directory. Control files are backed by different types. + return linuxerr.ENOTDIR + } + + dstCGInode, ok := dst.(*cgroupInode) + if !ok { + // Not a cgroup inode, so definitely can't be *this* inode. + return linuxerr.EIO + } + // Note: We're intentionally comparing addresses, since two different dirs + // could plausibly be identical in memory, but would occupy different + // locations in memory. + if d != &dstCGInode.dir { + // Destination dir is a different cgroup inode. Cross directory renames + // aren't allowed. + return linuxerr.EIO + } + + // Rename moves oldname to newname within d. Proceed. + return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) +} + +// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only +// files in the filesystem are control files, which can't be deleted. +func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { + return linuxerr.EPERM +} + +// hasChildrenLocked returns whether the cgroup dir contains any objects that +// prevent it from being deleted. +func (d *dir) hasChildrenLocked() bool { + // Subdirs take a link on the parent, so checks if there are any direct + // children cgroups. Exclude the dir's self link and the link from ".". + if d.InodeAttrs.Links()-2 > 0 { + return true + } + return len(d.cgi.ts) > 0 +} + +// HasChildren implements kernfs.Inode.HasChildren. +// +// The empty check for a cgroupfs directory is unlike a regular directory since +// a cgroupfs directory will always have control files. A cgroupfs directory can +// be deleted if cgroup contains no tasks and has no sub-cgroups. +func (d *dir) HasChildren() bool { + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + return d.hasChildrenLocked() +} + +// RmDir implements kernfs.Inode.RmDir. +func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { + // Unlike a normal directory, we need to recheck if d is empty again, since + // vfs/kernfs can't stop tasks from entering or leaving the cgroup. + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + + cgi, ok := child.(*cgroupInode) + if !ok { + return linuxerr.ENOTDIR + } + if cgi.dir.hasChildrenLocked() { + return linuxerr.ENOTEMPTY + } + + // Disallow deletion of the effective root cgroup. + if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { + ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) + return linuxerr.EBUSY + } + + err := d.OrderedChildren.RmDir(ctx, name, child) + if err == nil { + d.InodeAttrs.DecLinks() + } + return err } // controllerFile represents a generic control file that appears within a cgroup diff --git a/pkg/sentry/fsimpl/cgroupfs/memory.go b/pkg/sentry/fsimpl/cgroupfs/memory.go index 485c98376..d880c9bc4 100644 --- a/pkg/sentry/fsimpl/cgroupfs/memory.go +++ b/pkg/sentry/fsimpl/cgroupfs/memory.go @@ -31,22 +31,34 @@ import ( type memoryController struct { controllerCommon - limitBytes int64 + limitBytes int64 + softLimitBytes int64 + moveChargeAtImmigrate int64 } var _ controller = (*memoryController)(nil) func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryController { c := &memoryController{ - // Linux sets this to (PAGE_COUNTER_MAX * PAGE_SIZE) by default, which - // is ~ 2**63 on a 64-bit system. So essentially, inifinity. The exact - // value isn't very important. - limitBytes: math.MaxInt64, + // Linux sets these limits to (PAGE_COUNTER_MAX * PAGE_SIZE) by default, + // which is ~ 2**63 on a 64-bit system. So essentially, inifinity. The + // exact value isn't very important. + + limitBytes: math.MaxInt64, + softLimitBytes: math.MaxInt64, } - if val, ok := defaults["memory.limit_in_bytes"]; ok { - c.limitBytes = val - delete(defaults, "memory.limit_in_bytes") + + consumeDefault := func(name string, valPtr *int64) { + if val, ok := defaults[name]; ok { + *valPtr = val + delete(defaults, name) + } } + + consumeDefault("memory.limit_in_bytes", &c.limitBytes) + consumeDefault("memory.soft_limit_in_bytes", &c.softLimitBytes) + consumeDefault("memory.move_charge_at_immigrate", &c.moveChargeAtImmigrate) + c.controllerCommon.init(controllerMemory, fs) return c } @@ -55,6 +67,8 @@ func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryContr func (c *memoryController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents["memory.usage_in_bytes"] = c.fs.newControllerFile(ctx, creds, &memoryUsageInBytesData{}) contents["memory.limit_in_bytes"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.limitBytes)) + contents["memory.soft_limit_in_bytes"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.softLimitBytes)) + contents["memory.move_charge_at_immigrate"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.moveChargeAtImmigrate)) } // +stateify savable diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 6af3c3781..e0b879339 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -29,6 +29,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", @@ -44,7 +45,6 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], @@ -59,5 +59,6 @@ go_test( "//pkg/abi/linux", "//pkg/sentry/contexttest", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index e75954105..e711debcb 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -25,10 +25,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the filesystem name. @@ -56,7 +56,7 @@ func (*FilesystemType) Name() string { func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // No data allowed. if opts.Data != "" { - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fstype.initOnce.Do(func() { @@ -179,7 +179,7 @@ func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credential i.mu.Lock() defer i.mu.Unlock() if i.nextIdx == math.MaxUint32 { - return nil, syserror.ENOMEM + return nil, linuxerr.ENOMEM } idx := i.nextIdx i.nextIdx++ @@ -240,7 +240,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } i.mu.Lock() defer i.mu.Unlock() @@ -249,7 +249,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro return ri, nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // IterDirents implements kernfs.Inode.IterDirents. diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go index 448390cfe..1ef07d702 100644 --- a/pkg/sentry/fsimpl/devpts/devpts_test.go +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) func TestSimpleMasterToReplica(t *testing.T) { @@ -54,3 +55,36 @@ func TestSimpleMasterToReplica(t *testing.T) { t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) } } + +type callback func(*waiter.Entry, waiter.EventMask) + +func (cb callback) Callback(entry *waiter.Entry, mask waiter.EventMask) { + cb(entry, mask) +} + +func TestEchoDeadlock(t *testing.T) { + ctx := contexttest.Context(t) + termios := linux.DefaultReplicaTermios + termios.LocalFlags |= linux.ECHO + ld := newLineDiscipline(termios) + outBytes := make([]byte, 32) + dst := usermem.BytesIOSequence(outBytes) + entry := &waiter.Entry{Callback: callback(func(*waiter.Entry, waiter.EventMask) { + ld.inputQueueRead(ctx, dst) + })} + ld.masterWaiter.EventRegister(entry, waiter.ReadableEvents) + defer ld.masterWaiter.EventUnregister(entry) + inBytes := []byte("hello, tty\n") + n, err := ld.inputQueueWrite(ctx, usermem.BytesIOSequence(inBytes)) + if err != nil { + t.Fatalf("inputQueueWrite: %v", err) + } + if int(n) != len(inBytes) { + t.Fatalf("read wrong length: got %d, want %d", n, len(inBytes)) + } + outStr := string(outBytes[:n]) + inStr := string(inBytes) + if outStr != inStr { + t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) + } +} diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index e94a5bac3..609623f9f 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -70,6 +70,10 @@ const ( // +------------------------| output queue |<--------------------------+ // (outputQueueRead) +--------------+ (outputQueueWrite) // +// There is special handling for the ECHO option, where bytes written to the +// input queue are also output back to the terminal by being written to +// l.outQueue by the input queue transformer. +// // Lock order: // termiosMu // inQueue.mu @@ -126,7 +130,6 @@ func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArgument // setTermios sets a linux.Termios for the tty. func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() - defer l.termiosMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios @@ -141,7 +144,10 @@ func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArgument l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() + l.termiosMu.Unlock() l.replicaWaiter.Notify(waiter.ReadableEvents) + } else { + l.termiosMu.Unlock() } return 0, err @@ -179,33 +185,42 @@ func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, pushed, err := l.inQueue.read(ctx, dst, l) + n, pushed, notifyEcho, err := l.inQueue.read(ctx, dst, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } if n > 0 { - l.masterWaiter.Notify(waiter.WritableEvents) + if notifyEcho { + l.masterWaiter.Notify(waiter.ReadableEvents | waiter.WritableEvents) + } else { + l.masterWaiter.Notify(waiter.WritableEvents) + } if pushed { l.replicaWaiter.Notify(waiter.ReadableEvents) } return n, nil + } else if notifyEcho { + l.masterWaiter.Notify(waiter.ReadableEvents) } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, err := l.inQueue.write(ctx, src, l) + n, notifyEcho, err := l.inQueue.write(ctx, src, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } + if notifyEcho { + l.masterWaiter.Notify(waiter.ReadableEvents) + } if n > 0 { l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { @@ -214,8 +229,9 @@ func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, pushed, err := l.outQueue.read(ctx, dst, l) + // Ignore notifyEcho, as it cannot happen when reading from the output queue. + n, pushed, _, err := l.outQueue.read(ctx, dst, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } @@ -226,13 +242,14 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ } return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() - defer l.termiosMu.RUnlock() - n, err := l.outQueue.write(ctx, src, l) + // Ignore notifyEcho, as it cannot happen when writing to the output queue. + n, _, err := l.outQueue.write(ctx, src, l) + l.termiosMu.RUnlock() if err != nil { return 0, err } @@ -240,13 +257,14 @@ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSeq l.masterWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // transformer is a helper interface to make it easier to stateify queue. type transformer interface { // transform functions require queue's mutex to be held. - transform(*lineDiscipline, *queue, []byte) int + // The boolean indicates whether there was any echoed bytes. + transform(*lineDiscipline, *queue, []byte) (int, bool) } // outputQueueTransformer implements transformer. It performs line discipline @@ -261,7 +279,7 @@ type outputQueueTransformer struct{} // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. -func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { +func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // transformOutput is effectively always in noncanonical mode, as the // master termios never has ICANON set. @@ -270,7 +288,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte if len(q.readBuf) > 0 { q.readable = true } - return len(buf) + return len(buf), false } var ret int @@ -321,7 +339,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte if len(q.readBuf) > 0 { q.readable = true } - return ret + return ret, false } // inputQueueTransformer implements transformer. It performs line discipline @@ -334,15 +352,17 @@ type inputQueueTransformer struct{} // transformed according to flags set in the termios struct. See // drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel // function. +// It returns an extra boolean indicating whether any characters need to be +// echoed, in which case we need to notify readers. // // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. -func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { +func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // If there's a line waiting to be read in canonical mode, don't write // anything else to the read buffer. if l.termios.LEnabled(linux.ICANON) && q.readable { - return 0 + return 0, false } maxBytes := nonCanonMaxBytes @@ -351,6 +371,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) } var ret int + var notifyEcho bool for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { size := l.peek(buf) cBytes := append([]byte{}, buf[:size]...) @@ -397,7 +418,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { l.outQueue.writeBytes(cBytes, l) - l.masterWaiter.Notify(waiter.ReadableEvents) + notifyEcho = true } // If we finish a line, make it available for reading. @@ -412,7 +433,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) q.readable = true } - return ret + return ret, notifyEcho } // shouldDiscard returns whether c should be discarded. In canonical mode, if diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 93c031c89..9a1a245dc 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -17,6 +17,7 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -80,7 +80,7 @@ func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs // SetStat implements kernfs.Inode.SetStat func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } @@ -132,7 +132,7 @@ func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -177,7 +177,7 @@ func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index 47b0f1599..85aeefa43 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -17,12 +17,12 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -98,17 +98,19 @@ func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArg } -// read reads from q to userspace. It returns the number of bytes read as well -// as whether the read caused more readable data to become available (whether +// read reads from q to userspace. It returns: +// - The number of bytes read +// - Whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). +// - Whether any data was echoed back (need to notify readers). // // Preconditions: l.termiosMu must be held for reading. -func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { +func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, bool, error) { q.mu.Lock() defer q.mu.Unlock() if !q.readable { - return 0, false, syserror.ErrWouldBlock + return 0, false, false, linuxerr.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { @@ -131,19 +133,20 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl return n, nil })) if err != nil { - return 0, false, err + return 0, false, false, err } // Move data from the queue's wait buffer to its read buffer. - nPushed := q.pushWaitBufLocked(l) + nPushed, notifyEcho := q.pushWaitBufLocked(l) - return int64(n), nPushed > 0, nil + return int64(n), nPushed > 0, notifyEcho, nil } // write writes to q from userspace. +// The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. -func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { +func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() @@ -153,7 +156,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip room := waitBufMaxBytes - q.waitBufLen // If out of room, return EAGAIN. if room == 0 && copyLen > 0 { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Cap the size of the wait buffer. if copyLen > room { @@ -173,44 +176,49 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip return n, nil })) if err != nil { - return 0, err + return 0, false, err } // Push data from the wait to the read buffer. - q.pushWaitBufLocked(l) + _, notifyEcho := q.pushWaitBufLocked(l) - return n, nil + return n, notifyEcho, nil } // writeBytes writes to q from b. +// The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. -func (q *queue) writeBytes(b []byte, l *lineDiscipline) { +func (q *queue) writeBytes(b []byte, l *lineDiscipline) bool { q.mu.Lock() defer q.mu.Unlock() // Write to the wait buffer. q.waitBufAppend(b) - q.pushWaitBufLocked(l) + _, notifyEcho := q.pushWaitBufLocked(l) + return notifyEcho } // pushWaitBufLocked fills the queue's read buffer with data from the wait // buffer. +// The returned boolean indicates whether any data was echoed back. // // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be locked. -func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { +func (q *queue) pushWaitBufLocked(l *lineDiscipline) (int, bool) { if q.waitBufLen == 0 { - return 0 + return 0, false } // Move data from the wait to the read buffer. var total int var i int + var notifyEcho bool for i = 0; i < len(q.waitBuf); i++ { - n := q.transform(l, q, q.waitBuf[i]) + n, echo := q.transform(l, q, q.waitBuf[i]) total += n + notifyEcho = notifyEcho || echo if n != len(q.waitBuf[i]) { // The read buffer filled up without consuming the // entire buffer. @@ -223,7 +231,7 @@ func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { q.waitBuf = q.waitBuf[i:] q.waitBufLen -= uint64(total) - return total + return total, notifyEcho } // Precondition: q.mu must be locked. diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go index 96d2054cb..e251897b4 100644 --- a/pkg/sentry/fsimpl/devpts/replica.go +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -17,13 +17,13 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -92,7 +92,7 @@ func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vf // SetStat implements kernfs.Inode.SetStat func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } @@ -141,7 +141,7 @@ func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, arg t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { @@ -179,7 +179,7 @@ func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, arg return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD index c09fdc7f9..1cb049a29 100644 --- a/pkg/sentry/fsimpl/eventfd/BUILD +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -9,11 +9,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fdnotifier", "//pkg/hostarch", "//pkg/log", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index 4f79cfcb7..af5ba5131 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -22,11 +22,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -149,7 +149,7 @@ func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem var buf [8]byte if _, err := unix.Read(efd.hostfd, buf[:]); err != nil { if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -167,7 +167,7 @@ func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequenc // We can't complete the read if the value is currently zero. if efd.val == 0 { efd.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } // Update the value based on the mode the event is operating in. @@ -200,7 +200,7 @@ func (efd *EventFileDescription) hostWriteLocked(val uint64) error { hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(efd.hostfd, buf[:]) if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -232,7 +232,7 @@ func (efd *EventFileDescription) Signal(val uint64) error { // uint64 minus 1. if val > math.MaxUint64-1-efd.val { efd.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } efd.val += val diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 2dbc6bfd5..e69de29bb 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -1,103 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "dirent_list", - out = "dirent_list.go", - package = "ext", - prefix = "dirent", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*dirent", - "Linker": "*dirent", - }, -) - -go_template_instance( - name = "fstree", - out = "fstree.go", - package = "ext", - prefix = "generic", - template = "//pkg/sentry/vfs/genericfstree:generic_fstree", - types = { - "Dentry": "dentry", - }, -) - -go_library( - name = "ext", - srcs = [ - "block_map_file.go", - "dentry.go", - "directory.go", - "dirent_list.go", - "ext.go", - "extent_file.go", - "file_description.go", - "filesystem.go", - "fstree.go", - "inode.go", - "regular_file.go", - "symlink.go", - "utils.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/fd", - "//pkg/fspath", - "//pkg/log", - "//pkg/marshal", - "//pkg/marshal/primitive", - "//pkg/safemem", - "//pkg/sentry/arch", - "//pkg/sentry/fs", - "//pkg/sentry/fs/lock", - "//pkg/sentry/fsimpl/ext/disklayout", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/memmap", - "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/syscalls/linux", - "//pkg/sentry/vfs", - "//pkg/sync", - "//pkg/syserror", - "//pkg/usermem", - "//pkg/waiter", - ], -) - -go_test( - name = "ext_test", - size = "small", - srcs = [ - "block_map_test.go", - "ext_test.go", - "extent_test.go", - ], - data = [ - "//pkg/sentry/fsimpl/ext:assets/bigfile.txt", - "//pkg/sentry/fsimpl/ext:assets/file.txt", - "//pkg/sentry/fsimpl/ext:assets/tiny.ext2", - "//pkg/sentry/fsimpl/ext:assets/tiny.ext3", - "//pkg/sentry/fsimpl/ext:assets/tiny.ext4", - ], - library = ":ext", - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/fspath", - "//pkg/marshal/primitive", - "//pkg/sentry/contexttest", - "//pkg/sentry/fsimpl/ext/disklayout", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/vfs", - "//pkg/syserror", - "//pkg/test/testutil", - "//pkg/usermem", - "@com_github_google_go_cmp//cmp:go_default_library", - "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", - ], -) diff --git a/pkg/sentry/fsimpl/ext/README.md b/pkg/sentry/fsimpl/ext/README.md deleted file mode 100644 index af00cfda8..000000000 --- a/pkg/sentry/fsimpl/ext/README.md +++ /dev/null @@ -1,117 +0,0 @@ -## EXT(2/3/4) File System - -This is a filesystem driver which supports ext2, ext3 and ext4 filesystems. -Linux has specialized drivers for each variant but none which supports all. This -library takes advantage of ext's backward compatibility and understands the -internal organization of on-disk structures to support all variants. - -This driver implementation diverges from the Linux implementations in being more -forgiving about versioning. For instance, if a filesystem contains both extent -based inodes and classical block map based inodes, this driver will not complain -and interpret them both correctly. While in Linux this would be an issue. This -blurs the line between the three ext fs variants. - -Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has -been superseded by ext4 by large performance gains. Thus it is recommended to -upgrade older filesystem images to ext4 using e2fsprogs for better performance. - -### Read Only - -This driver currently only allows read only operations. A lot of the design -decisions are based on this feature. There are plans to implement write (the -process for which is documented in the future work section). - -### Performance - -One of the biggest wins about this driver is that it directly talks to the -underlying block device (or whatever persistent storage is being used), instead -of making expensive RPCs to a gofer. - -Another advantage is that ext fs supports fast concurrent reads. Currently the -device is represented using a `io.ReaderAt` which allows for concurrent reads. -All reads are directly passed to the device driver which intelligently serves -the read requests in the optimal order. There is no congestion due to locking -while reading in the filesystem level. - -Reads are optimized further in the way file data is transferred over to user -memory. Ext fs directly copies over file data from disk into user memory with no -additional allocations on the way. We can only get faster by preloading file -data into memory (see future work section). - -The internal structures used to represent files, inodes and file descriptors use -a lot of inheritance. With the level of indirection that an interface adds with -an internal pointer, it can quickly fragment a structure across memory. As this -runs along side a full blown kernel (which is memory intensive), having a -fragmented struct might hurt performance. Hence these internal structures, -though interfaced, are tightly packed in memory using the same inheritance -pattern that pkg/sentry/vfs uses. The pkg/sentry/fsimpl/ext/disklayout package -makes an execption to this pattern for reasons documented in the package. - -### Security - -This driver also intends to help sandbox the container better by reducing the -surface of the host kernel that the application touches. It prevents the -application from exploiting vulnerabilities in the host filesystem driver. All -`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` which are directly -passed to the device driver in the kernel. Hence this reduces the surface for -attack. - -The application can not affect any host filesystems other than the one passed -via block device by the user. - -### Future Work - -#### Write - -To support write operations we would need to modify the block device underneath. -Currently, the driver does not modify the device at all, not even for updating -the access times for reads. Modifying the filesystem incorrectly can corrupt it -and render it unreadable for other correct ext(x) drivers. Hence caution must be -maintained while modifying metadata structures. - -Ext4 specifically is built for performance and has added a lot of complexity as -to how metadata structures are modified. For instance, files that are organized -via an extent tree which must be balanced and file data blocks must be placed in -the same extent as much as possible to increase locality. Such properties must -be maintained while modifying the tree. - -Ext filesystems boast a lot about locality, which plays a big role in them being -performant. The block allocation algorithm in Linux does a good job in keeping -related data together. This behavior must be maintained as much as possible, -else we might end up degrading the filesystem performance over time. - -Ext4 also supports a wide variety of features which are specialized for varying -use cases. Implementing all of them can get difficult very quickly. - -Ext(x) checksums all its metadata structures to check for corruption, so -modification of any metadata struct must correspond with re-checksumming the -struct. Linux filesystem drivers also order on-disk updates intelligently to not -corrupt the filesystem and also remain performant. The in-memory metadata -structures must be kept in sync with what is on disk. - -There is also replication of some important structures across the filesystem. -All replicas must be updated when their original copy is updated. There is also -provisioning for snapshotting which must be kept in mind, although it should not -affect this implementation unless we allow users to create filesystem snapshots. - -Ext4 also introduced journaling (jbd2). The journal must be updated -appropriately. - -#### Performance - -To improve performance we should implement a buffer cache, and optionally, read -ahead for small files. While doing so we must also keep in mind the memory usage -and have a reasonable cap on how much file data we want to hold in memory. - -#### Features - -Our current implementation will work with most ext4 filesystems for readonly -purposed. However, the following features are not supported yet: - -- Journal -- Snapshotting -- Extended Attributes -- Hash Tree Directories -- Meta Block Groups -- Multiple Mount Protection -- Bigalloc diff --git a/pkg/sentry/fsimpl/ext/assets/README.md b/pkg/sentry/fsimpl/ext/assets/README.md deleted file mode 100644 index 6f1e81b3a..000000000 --- a/pkg/sentry/fsimpl/ext/assets/README.md +++ /dev/null @@ -1,36 +0,0 @@ -### Tiny Ext(2/3/4) Images - -The images are of size 64Kb which supports 64 1k blocks and 16 inodes. This is -the smallest size mkfs.ext(2/3/4) works with. - -These images were generated using the following commands. - -```bash -fallocate -l 64K tiny.ext$VERSION -mkfs.ext$VERSION -j tiny.ext$VERSION -``` - -where `VERSION` is `2`, `3` or `4`. - -You can mount it using: - -```bash -sudo mount -o loop tiny.ext$VERSION $MOUNTPOINT -``` - -`file.txt`, `bigfile.txt` and `symlink.txt` were added to this image by just -mounting it and copying (while preserving links) those files to the mountpoint -directory using: - -```bash -sudo cp -P {file.txt,symlink.txt,bigfile.txt} $MOUNTPOINT -``` - -The files in this directory mirror the contents and organisation of the files -stored in the image. - -You can umount the filesystem using: - -```bash -sudo umount $MOUNTPOINT -``` diff --git a/pkg/sentry/fsimpl/ext/assets/bigfile.txt b/pkg/sentry/fsimpl/ext/assets/bigfile.txt deleted file mode 100644 index 3857cf516..000000000 --- a/pkg/sentry/fsimpl/ext/assets/bigfile.txt +++ /dev/null @@ -1,41 +0,0 @@ -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus faucibus eleifend orci, ut ornare nibh faucibus eu. Cras at condimentum massa. Nullam luctus, elit non porttitor congue, sapien diam feugiat sapien, sed eleifend nulla mauris non arcu. Sed lacinia mauris magna, eu mollis libero varius sit amet. Donec mollis, quam convallis commodo posuere, dolor nisi placerat nisi, in faucibus augue mi eu lorem. In pharetra consectetur faucibus. Ut euismod ex efficitur egestas tincidunt. Maecenas condimentum ut ante in rutrum. Vivamus sed arcu tempor, faucibus turpis et, lacinia diam. - -Sed in lacus vel nisl interdum bibendum in sed justo. Nunc tellus risus, molestie vitae arcu sed, molestie tempus ligula. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc risus neque, volutpat et ante non, ullamcorper condimentum ante. Aliquam sed metus in urna condimentum convallis. Vivamus ut libero mauris. Proin mollis posuere consequat. Vestibulum placerat mollis est et pulvinar. - -Donec rutrum odio ac diam pharetra, id fermentum magna cursus. Pellentesque in dapibus elit, et condimentum orci. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Suspendisse euismod dapibus est, id vestibulum mauris. Nulla facilisi. Nulla cursus gravida nisi. Phasellus vestibulum rutrum lectus, a dignissim mauris hendrerit vitae. In at elementum mauris. Integer vel efficitur velit. Nullam fringilla sapien mi, quis luctus neque efficitur ac. Aenean nec quam dapibus nunc commodo pharetra. Proin sapien mi, fermentum aliquet vulputate non, aliquet porttitor diam. Quisque lacinia, urna et finibus fermentum, nunc lacus vehicula ex, sed congue metus lectus ac quam. Aliquam erat volutpat. Suspendisse sodales, dolor ut tincidunt finibus, augue erat varius tellus, a interdum erat sem at nunc. Vestibulum cursus iaculis sapien, vitae feugiat dui auctor quis. - -Pellentesque nec maximus nulla, eu blandit diam. Maecenas quis arcu ornare, congue ante at, vehicula ipsum. Praesent feugiat mauris rutrum sem fermentum, nec luctus ipsum placerat. Pellentesque placerat ipsum at dignissim fringilla. Vivamus et posuere sem, eget hendrerit felis. Aenean vulputate, augue vel mollis feugiat, justo ipsum mollis dolor, eu mollis elit neque ut ipsum. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Fusce bibendum sem quam, vulputate laoreet mi dapibus imperdiet. Sed a purus non nibh pretium aliquet. Integer eget luctus augue, vitae tincidunt magna. Ut eros enim, egestas eu nulla et, lobortis egestas arcu. Cras id ipsum ac justo lacinia rutrum. Vivamus lectus leo, ultricies sed justo at, pellentesque feugiat magna. Ut sollicitudin neque elit, vel ornare mauris commodo id. - -Duis dapibus orci et sapien finibus finibus. Mauris eleifend, lacus at vestibulum maximus, quam ligula pharetra erat, sit amet dapibus neque elit vitae neque. In bibendum sollicitudin erat, eget ultricies tortor malesuada at. Sed sit amet orci turpis. Donec feugiat ligula nibh, molestie tincidunt lectus elementum id. Donec volutpat maximus nibh, in vulputate felis posuere eu. Cras tincidunt ullamcorper lacus. Phasellus porta lorem auctor, congue magna a, commodo elit. - -Etiam auctor mi quis elit sodales, eu pulvinar arcu condimentum. Aenean imperdiet risus et dapibus tincidunt. Nullam tincidunt dictum dui, sed commodo urna rutrum id. Ut mollis libero vel elit laoreet bibendum. Quisque arcu arcu, tincidunt at ultricies id, vulputate nec metus. In tristique posuere quam sit amet volutpat. Vivamus scelerisque et nunc at dapibus. Fusce finibus libero ut ligula pretium rhoncus. Mauris non elit in arcu finibus imperdiet. Pellentesque nec massa odio. Proin rutrum mauris non sagittis efficitur. Aliquam auctor quam at dignissim faucibus. Ut eget ligula in magna posuere ultricies vitae sit amet turpis. Duis maximus odio nulla. Donec gravida sem tristique tempus scelerisque. - -Interdum et malesuada fames ac ante ipsum primis in faucibus. Fusce pharetra magna vulputate aliquet tempus. Duis id hendrerit arcu. Quisque ut ex elit. Integer velit orci, venenatis ut sapien ac, placerat porttitor dui. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc hendrerit cursus diam, hendrerit finibus ipsum scelerisque ut. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. - -Nulla non euismod neque. Phasellus vel sapien eu metus pulvinar rhoncus. Suspendisse eu mollis tellus, quis vestibulum tortor. Maecenas interdum dolor sed nulla fermentum maximus. Donec imperdiet ullamcorper condimentum. Nam quis nibh ante. Praesent quis tellus ut tortor pulvinar blandit sit amet ut sapien. Vestibulum est orci, pellentesque vitae tristique sit amet, tristique non felis. - -Vivamus sodales pellentesque varius. Sed vel tempus ligula. Nulla tristique nisl vel dui facilisis, ac sodales augue hendrerit. Proin augue nisi, vestibulum quis augue nec, sagittis tincidunt velit. Vestibulum euismod, nulla nec sodales faucibus, urna sapien vulputate magna, id varius metus sapien ut neque. Duis in mollis urna, in scelerisque enim. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nunc condimentum dictum turpis, et egestas neque dapibus eget. Quisque fringilla, dui eu venenatis eleifend, erat nibh lacinia urna, at lacinia lacus sapien eu dui. Duis eu erat ut mi lacinia convallis a sed ex. - -Fusce elit metus, tincidunt nec eleifend a, hendrerit nec ligula. Duis placerat finibus sollicitudin. In euismod porta tellus, in luctus justo bibendum bibendum. Maecenas at magna eleifend lectus tincidunt suscipit ut a ligula. Nulla tempor accumsan felis, fermentum dapibus est eleifend vitae. Mauris urna sem, fringilla at ultricies non, ultrices in arcu. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam vehicula nunc at laoreet imperdiet. Nunc tristique ut risus id aliquet. Integer eleifend massa orci. - -Vestibulum sed ante sollicitudin nisi fringilla bibendum nec vel quam. Sed pretium augue eu ligula congue pulvinar. Donec vitae magna tincidunt, pharetra lacus id, convallis nulla. Cras viverra nisl nisl, varius convallis leo vulputate nec. Morbi at consequat dui, sed aliquet metus. Sed suscipit fermentum mollis. Maecenas nec mi sodales, tincidunt purus in, tristique mauris. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec interdum mi in velit efficitur, quis ultrices ex imperdiet. Sed vestibulum, magna ut tristique pretium, mi ipsum placerat tellus, non tempor enim augue et ex. Pellentesque eget felis quis ante sodales viverra ac sed lacus. Donec suscipit tempus massa, eget laoreet massa molestie at. - -Aenean fringilla dui non aliquet consectetur. Fusce cursus quam nec orci hendrerit faucibus. Donec consequat suscipit enim, non volutpat lectus auctor interdum. Proin lorem purus, maximus vel orci vitae, suscipit egestas turpis. Donec risus urna, congue a sem eu, aliquet placerat odio. Morbi gravida tristique turpis, quis efficitur enim. Nunc interdum gravida ipsum vel facilisis. Nunc congue finibus sollicitudin. Quisque euismod aliquet lectus et tincidunt. Curabitur ultrices sem ut mi fringilla fermentum. Morbi pretium, nisi sit amet dapibus congue, dolor enim consectetur risus, a interdum ligula odio sed odio. Quisque facilisis, mi at suscipit gravida, nunc sapien cursus justo, ut luctus odio nulla quis leo. Integer condimentum lobortis mauris, non egestas tellus lobortis sit amet. - -In sollicitudin velit ac ante vehicula, vitae varius tortor mollis. In hac habitasse platea dictumst. Quisque et orci lorem. Integer malesuada fringilla luctus. Pellentesque malesuada, mi non lobortis porttitor, ante ligula vulputate ante, nec dictum risus eros sit amet sapien. Nulla aliquam lorem libero, ac varius nulla tristique eget. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut pellentesque mauris orci, vel consequat mi varius a. Ut sit amet elit vulputate, lacinia metus non, fermentum nisl. Pellentesque eu nisi sed quam egestas blandit. Duis sit amet lobortis dolor. Donec consectetur sem interdum, tristique elit sit amet, sodales lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Fusce id aliquam augue. Sed pretium congue risus vitae lacinia. Vestibulum non vulputate risus, ut malesuada justo. - -Sed odio elit, consectetur ac mauris quis, consequat commodo libero. Fusce sodales velit vulputate pulvinar fermentum. Donec iaculis nec nisl eget faucibus. Mauris at dictum velit. Donec fermentum lectus eu viverra volutpat. Aliquam consequat facilisis lorem, cursus consequat dui bibendum ullamcorper. Pellentesque nulla magna, imperdiet at magna et, cursus egestas enim. Nullam semper molestie lectus sit amet semper. Duis eget tincidunt est. Integer id neque risus. Integer ultricies hendrerit vestibulum. Donec blandit blandit sagittis. Nunc consectetur vitae nisi consectetur volutpat. - -Nulla id lorem fermentum, efficitur magna a, hendrerit dui. Vivamus sagittis orci gravida, bibendum quam eget, molestie est. Phasellus nec enim tincidunt, volutpat sapien non, laoreet diam. Nulla posuere enim nec porttitor lobortis. Donec auctor odio ut orci eleifend, ut eleifend purus convallis. Interdum et malesuada fames ac ante ipsum primis in faucibus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut hendrerit, purus eget viverra tincidunt, sem magna imperdiet libero, et aliquam turpis neque vitae elit. Maecenas semper varius iaculis. Cras non lorem quis quam bibendum eleifend in et libero. Curabitur at purus mauris. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus porta diam sed elit eleifend gravida. - -Nulla facilisi. Ut ultricies diam vel diam consectetur, vel porta augue molestie. Fusce interdum sapien et metus facilisis pellentesque. Nulla convallis sem at nunc vehicula facilisis. Nam ac rutrum purus. Nunc bibendum, dolor sit amet tempus ullamcorper, lorem leo tempor sem, id fringilla nunc augue scelerisque augue. Nullam sit amet rutrum nisl. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Donec sed mauris gravida eros vehicula sagittis at eget orci. Cras elementum, eros at accumsan bibendum, libero neque blandit purus, vitae vestibulum libero massa ac nibh. Integer at placerat nulla. Mauris eu eleifend orci. Aliquam consequat ligula vitae erat porta lobortis. Duis fermentum elit ac aliquet ornare. - -Mauris eget cursus tellus, eget sodales purus. Aliquam malesuada, augue id vulputate finibus, nisi ex bibendum nisl, sit amet laoreet quam urna a dolor. Nullam ultricies, sapien eu laoreet consequat, erat eros dignissim diam, ultrices sodales lectus mauris et leo. Morbi lacinia eu ante at tempus. Sed iaculis finibus magna malesuada efficitur. Donec faucibus erat sit amet elementum feugiat. Praesent a placerat nisi. Etiam lacinia gravida diam, et sollicitudin sapien tincidunt ut. - -Maecenas felis quam, tincidunt vitae venenatis scelerisque, viverra vitae odio. Phasellus enim neque, ultricies suscipit malesuada sit amet, vehicula sit amet purus. Nulla placerat sit amet dui vel tincidunt. Nam quis neque vel magna commodo egestas. Vestibulum sagittis rutrum lorem ut congue. Maecenas vel ultrices tellus. Donec efficitur, urna ac consequat iaculis, lorem felis pharetra eros, eget faucibus orci lectus sit amet arcu. - -Ut a tempus nisi. Nulla facilisi. Praesent vulputate maximus mi et dapibus. Sed sit amet libero ac augue hendrerit efficitur in a sapien. Mauris placerat velit sit amet tellus sollicitudin faucibus. Donec egestas a magna ac suscipit. Duis enim sapien, mollis sed egestas et, vestibulum vel leo. - -Proin quis dapibus dui. Donec eu tincidunt nunc. Vivamus eget purus consectetur, maximus ante vitae, tincidunt elit. Aenean mattis dolor a gravida aliquam. Praesent quis tellus id sem maximus vulputate nec sed nulla. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur metus nulla, volutpat volutpat est eu, hendrerit congue erat. Aliquam sollicitudin augue ante. Sed sollicitudin, magna eu consequat elementum, mi augue ullamcorper felis, molestie imperdiet erat metus iaculis est. Proin ac tortor nisi. Pellentesque quis nisi risus. Integer enim sapien, tincidunt quis tortor id, accumsan venenatis mi. Nulla facilisi. - -Cras pretium sit amet quam congue maximus. Morbi lacus libero, imperdiet commodo massa sed, scelerisque placerat libero. Cras nisl nisi, consectetur sed bibendum eu, venenatis at enim. Proin sodales justo at quam aliquam, a consectetur mi ornare. Donec porta ac est sit amet efficitur. Suspendisse vestibulum tortor id neque imperdiet, id lacinia risus vehicula. Phasellus ac eleifend purus. Mauris vel gravida ante. Aliquam vitae lobortis risus. Sed vehicula consectetur tincidunt. Nam et justo vitae purus molestie consequat. Pellentesque ipsum ex, convallis quis blandit non, gravida et urna. Donec diam ligula amet. diff --git a/pkg/sentry/fsimpl/ext/assets/file.txt b/pkg/sentry/fsimpl/ext/assets/file.txt deleted file mode 100644 index 980a0d5f1..000000000 --- a/pkg/sentry/fsimpl/ext/assets/file.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/pkg/sentry/fsimpl/ext/assets/symlink.txt b/pkg/sentry/fsimpl/ext/assets/symlink.txt deleted file mode 120000 index 4c330738c..000000000 --- a/pkg/sentry/fsimpl/ext/assets/symlink.txt +++ /dev/null @@ -1 +0,0 @@ -file.txt
\ No newline at end of file diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext2 b/pkg/sentry/fsimpl/ext/assets/tiny.ext2 Binary files differdeleted file mode 100644 index 381ade9bf..000000000 --- a/pkg/sentry/fsimpl/ext/assets/tiny.ext2 +++ /dev/null diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext3 b/pkg/sentry/fsimpl/ext/assets/tiny.ext3 Binary files differdeleted file mode 100644 index 0e97a324c..000000000 --- a/pkg/sentry/fsimpl/ext/assets/tiny.ext3 +++ /dev/null diff --git a/pkg/sentry/fsimpl/ext/assets/tiny.ext4 b/pkg/sentry/fsimpl/ext/assets/tiny.ext4 Binary files differdeleted file mode 100644 index a6859736d..000000000 --- a/pkg/sentry/fsimpl/ext/assets/tiny.ext4 +++ /dev/null diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD deleted file mode 100644 index 6c5a559fd..000000000 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -load("//tools:defs.bzl", "go_test") - -package(licenses = ["notice"]) - -go_test( - name = "benchmark_test", - size = "small", - srcs = ["benchmark_test.go"], - deps = [ - "//pkg/context", - "//pkg/fspath", - "//pkg/sentry/contexttest", - "//pkg/sentry/fsimpl/ext", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/vfs", - ], -) diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go deleted file mode 100644 index 2ee7cc7ac..000000000 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// These benchmarks emulate memfs benchmarks. Ext4 images must be created -// before this benchmark is run using the `make_deep_ext4.sh` script at -// /tmp/image-{depth}.ext4 for all the depths tested below. -// -// The benchmark itself cannot run the script because the script requires -// sudo privileges to create the file system images. -package benchmark_test - -import ( - "fmt" - "os" - "runtime" - "strings" - "testing" - - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -var depths = []int{1, 2, 3, 8, 64, 100} - -const filename = "file.txt" - -// setUp opens imagePath as an ext Filesystem and returns all necessary -// elements required to run tests. If error is nil, it also returns a tear -// down function which must be called after the test is run for clean up. -func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { - f, err := os.Open(imagePath) - if err != nil { - return nil, nil, nil, nil, err - } - - ctx := contexttest.Context(b) - creds := auth.CredentialsFromContext(ctx) - - // Create VFS. - vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(ctx); err != nil { - return nil, nil, nil, nil, err - } - vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{ - GetFilesystemOptions: vfs.GetFilesystemOptions{ - InternalData: int(f.Fd()), - }, - }) - if err != nil { - f.Close() - return nil, nil, nil, nil, err - } - - root := mntns.Root() - root.IncRef() - - tearDown := func() { - root.DecRef(ctx) - - if err := f.Close(); err != nil { - b.Fatalf("tearDown failed: %v", err) - } - } - return ctx, vfsObj, &root, tearDown, nil -} - -// mount mounts extfs at the path operation passed. Returns a tear down -// function which must be called after the test is run for clean up. -func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vfs.PathOperation) func() { - b.Helper() - - f, err := os.Open(imagePath) - if err != nil { - b.Fatalf("could not open image at %s: %v", imagePath, err) - } - - ctx := contexttest.Context(b) - creds := auth.CredentialsFromContext(ctx) - - if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ - GetFilesystemOptions: vfs.GetFilesystemOptions{ - InternalData: int(f.Fd()), - }, - }); err != nil { - b.Fatalf("failed to mount tmpfs submount: %v", err) - } - return func() { - if err := f.Close(); err != nil { - b.Fatalf("tearDown failed: %v", err) - } - } -} - -// BenchmarkVFS2Ext4fsStat emulates BenchmarkVFS2MemfsStat. -func BenchmarkVFS2Ext4fsStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", depth)) - if err != nil { - b.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - creds := auth.CredentialsFromContext(ctx) - var filePathBuilder strings.Builder - filePathBuilder.WriteByte('/') - for i := 1; i <= depth; i++ { - filePathBuilder.WriteString(fmt.Sprintf("%d", i)) - filePathBuilder.WriteByte('/') - } - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ - Root: *root, - Start: *root, - Path: fspath.Parse(filePath), - FollowFinalSymlink: true, - }, &vfs.StatOptions{}) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - // Sanity check. - if stat.Size > 0 { - b.Fatalf("got wrong file size (%d)", stat.Size) - } - } - }) - } -} - -// BenchmarkVFS2ExtfsMountStat emulates BenchmarkVFS2MemfsMountStat. -func BenchmarkVFS2ExtfsMountStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - // Create root extfs with depth 1 so we can mount extfs again at /1/. - ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", 1)) - if err != nil { - b.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - creds := auth.CredentialsFromContext(ctx) - mountPointName := "/1/" - pop := vfs.PathOperation{ - Root: *root, - Start: *root, - Path: fspath.Parse(mountPointName), - } - - // Save the mount point for later use. - mountPoint, err := vfsfs.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) - if err != nil { - b.Fatalf("failed to walk to mount point: %v", err) - } - defer mountPoint.DecRef(ctx) - - // Create extfs submount. - mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop) - defer mountTearDown() - - var filePathBuilder strings.Builder - filePathBuilder.WriteString(mountPointName) - for i := 1; i <= depth; i++ { - filePathBuilder.WriteString(fmt.Sprintf("%d", i)) - filePathBuilder.WriteByte('/') - } - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ - Root: *root, - Start: *root, - Path: fspath.Parse(filePath), - FollowFinalSymlink: true, - }, &vfs.StatOptions{}) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - // Sanity check. touch(1) always creates files of size 0 (empty). - if stat.Size > 0 { - b.Fatalf("got wrong file size (%d)", stat.Size) - } - } - }) - } -} diff --git a/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh deleted file mode 100755 index d0910da1f..000000000 --- a/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script creates an ext4 image with $1 depth of directories and a file in -# the inner most directory. The created file is at path /1/2/.../depth/file.txt. -# The ext4 image is written to $2. The image is temporarily mounted at -# /tmp/mountpoint. This script must be run with sudo privileges. - -# Usage: -# sudo bash make_deep_ext4.sh {depth} {output path} - -# Check positional arguments. -if [ "$#" -ne 2 ]; then - echo "Usage: sudo bash make_deep_ext4.sh {depth} {output path}" - exit 1 -fi - -# Make sure depth is a non-negative number. -if ! [[ "$1" =~ ^[0-9]+$ ]]; then - echo "Depth must be a non-negative number." - exit 1 -fi - -# Create a 1 MB filesystem image at the requested output path. -rm -f $2 -fallocate -l 1M $2 -if [ $? -ne 0 ]; then - echo "fallocate failed" - exit $? -fi - -# Convert that blank into an ext4 image. -mkfs.ext4 -j $2 -if [ $? -ne 0 ]; then - echo "mkfs.ext4 failed" - exit $? -fi - -# Mount the image. -MOUNTPOINT=/tmp/mountpoint -mkdir -p $MOUNTPOINT -mount -o loop $2 $MOUNTPOINT -if [ $? -ne 0 ]; then - echo "mount failed" - exit $? -fi - -# Create nested directories and the file. -if [ "$1" -eq 0 ]; then - FILEPATH=$MOUNTPOINT/file.txt -else - FILEPATH=$MOUNTPOINT/$(seq -s '/' 1 $1)/file.txt -fi -mkdir -p $(dirname $FILEPATH) || exit -touch $FILEPATH - -# Clean up. -umount $MOUNTPOINT -rm -rf $MOUNTPOINT diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go deleted file mode 100644 index 1165234f9..000000000 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" - "math" - - "gvisor.dev/gvisor/pkg/marshal/primitive" - "gvisor.dev/gvisor/pkg/syserror" -) - -const ( - // numDirectBlks is the number of direct blocks in ext block map inodes. - numDirectBlks = 12 -) - -// blockMapFile is a type of regular file which uses direct/indirect block -// addressing to store file data. This was deprecated in ext4. -type blockMapFile struct { - regFile regularFile - - // directBlks are the direct blocks numbers. The physical blocks pointed by - // these holds file data. Contains file blocks 0 to 11. - directBlks [numDirectBlks]primitive.Uint32 - - // indirectBlk is the physical block which contains (blkSize/4) direct block - // numbers (as uint32 integers). - indirectBlk primitive.Uint32 - - // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect - // block numbers (as uint32 integers). - doubleIndirectBlk primitive.Uint32 - - // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly - // indirect block numbers (as uint32 integers). - tripleIndirectBlk primitive.Uint32 - - // coverage at (i)th index indicates the amount of file data a node at - // height (i) covers. Height 0 is the direct block. - coverage [4]uint64 -} - -// Compiles only if blockMapFile implements io.ReaderAt. -var _ io.ReaderAt = (*blockMapFile)(nil) - -// newBlockMapFile is the blockMapFile constructor. It initializes the file to -// physical blocks map with (at most) the first 12 (direct) blocks. -func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { - file := &blockMapFile{} - file.regFile.impl = file - file.regFile.inode.init(args, &file.regFile) - - for i := uint(0); i < 4; i++ { - file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i) - } - - blkMap := file.regFile.inode.diskInode.Data() - for i := 0; i < numDirectBlks; i++ { - file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4]) - } - file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4]) - file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4]) - file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4]) - return file, nil -} - -// ReadAt implements io.ReaderAt.ReadAt. -func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { - if len(dst) == 0 { - return 0, nil - } - - if off < 0 { - return 0, syserror.EINVAL - } - - offset := uint64(off) - size := f.regFile.inode.diskInode.Size() - if offset >= size { - return 0, io.EOF - } - - // dirBlksEnd is the file offset until which direct blocks cover file data. - // Direct blocks cover 0 <= file offset < dirBlksEnd. - dirBlksEnd := numDirectBlks * f.coverage[0] - - // indirBlkEnd is the file offset until which the indirect block covers file - // data. The indirect block covers dirBlksEnd <= file offset < indirBlkEnd. - indirBlkEnd := dirBlksEnd + f.coverage[1] - - // doubIndirBlkEnd is the file offset until which the double indirect block - // covers file data. The double indirect block covers the range - // indirBlkEnd <= file offset < doubIndirBlkEnd. - doubIndirBlkEnd := indirBlkEnd + f.coverage[2] - - read := 0 - toRead := len(dst) - if uint64(toRead)+offset > size { - toRead = int(size - offset) - } - for read < toRead { - var err error - var curR int - - // Figure out which block to delegate the read to. - switch { - case offset < dirBlksEnd: - // Direct block. - curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:]) - case offset < indirBlkEnd: - // Indirect block. - curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:]) - case offset < doubIndirBlkEnd: - // Doubly indirect block. - curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:]) - default: - // Triply indirect block. - curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:]) - } - - read += curR - offset += uint64(curR) - if err != nil { - return read, err - } - } - - if read < len(dst) { - return read, io.EOF - } - return read, nil -} - -// read is the recursive step of the ReadAt function. It relies on knowing the -// current node's location on disk (curPhyBlk) and its height in the block map -// tree. A height of 0 shows that the current node is actually holding file -// data. relFileOff tells the offset from which we need to start to reading -// under the current node. It is completely relative to the current node. -func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, dst []byte) (int, error) { - curPhyBlkOff := int64(curPhyBlk) * int64(f.regFile.inode.blkSize) - if height == 0 { - toRead := int(f.regFile.inode.blkSize - relFileOff) - if len(dst) < toRead { - toRead = len(dst) - } - - n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff)) - if n < toRead { - return n, syserror.EIO - } - return n, nil - } - - childCov := f.coverage[height-1] - startIdx := relFileOff / childCov - endIdx := f.regFile.inode.blkSize / 4 // This is exclusive. - wantEndIdx := (relFileOff + uint64(len(dst))) / childCov - wantEndIdx++ // Make this exclusive. - if wantEndIdx < endIdx { - endIdx = wantEndIdx - } - - read := 0 - curChildOff := relFileOff % childCov - for i := startIdx; i < endIdx; i++ { - var childPhyBlk primitive.Uint32 - err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) - if err != nil { - return read, err - } - - n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:]) - read += n - if err != nil { - return read, err - } - - curChildOff = 0 - } - - return read, nil -} - -// getCoverage returns the number of bytes a node at the given height covers. -// Height 0 is the file data block itself. Height 1 is the indirect block. -// -// Formula: blkSize * ((blkSize / 4)^height) -func getCoverage(blkSize uint64, height uint) uint64 { - return blkSize * uint64(math.Pow(float64(blkSize/4), float64(height))) -} diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go deleted file mode 100644 index ed98b482e..000000000 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "bytes" - "math/rand" - "testing" - - "github.com/google/go-cmp/cmp" - "gvisor.dev/gvisor/pkg/marshal/primitive" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" -) - -// These consts are for mocking the block map tree. -const ( - mockBMBlkSize = uint32(16) - mockBMDiskSize = 2500 -) - -// TestBlockMapReader stress tests block map reader functionality. It performs -// random length reads from all possible positions in the block map structure. -func TestBlockMapReader(t *testing.T) { - mockBMFile, want := blockMapSetUp(t) - n := len(want) - - for from := 0; from < n; from++ { - got := make([]byte, n-from) - - if read, err := mockBMFile.ReadAt(got, int64(from)); err != nil { - t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err) - } - - if diff := cmp.Diff(got, want[from:]); diff != "" { - t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff) - } - } -} - -// blkNumGen is a number generator which gives block numbers for building the -// block map file on disk. It gives unique numbers in a random order which -// facilitates in creating an extremely fragmented filesystem. -type blkNumGen struct { - nums []uint32 -} - -// newBlkNumGen is the blkNumGen constructor. -func newBlkNumGen() *blkNumGen { - blkNums := &blkNumGen{} - lim := mockBMDiskSize / mockBMBlkSize - blkNums.nums = make([]uint32, lim) - for i := uint32(0); i < lim; i++ { - blkNums.nums[i] = i - } - - rand.Shuffle(int(lim), func(i, j int) { - blkNums.nums[i], blkNums.nums[j] = blkNums.nums[j], blkNums.nums[i] - }) - return blkNums -} - -// next returns the next random block number. -func (n *blkNumGen) next() uint32 { - ret := n.nums[0] - n.nums = n.nums[1:] - return ret -} - -// blockMapSetUp creates a mock disk and a block map file. It initializes the -// block map file with 12 direct block, 1 indirect block, 1 double indirect -// block and 1 triple indirect block (basically fill it till the rim). It -// initializes the disk to reflect the inode. Also returns the file data that -// the inode covers and that is written to disk. -func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { - mockDisk := make([]byte, mockBMDiskSize) - var fileData []byte - blkNums := newBlkNumGen() - off := 0 - data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes()) - - // Write the direct blocks. - for i := 0; i < numDirectBlks; i++ { - curBlkNum := primitive.Uint32(blkNums.next()) - curBlkNum.MarshalBytes(data[off:]) - off += curBlkNum.SizeBytes() - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...) - } - - // Write to indirect block. - indirectBlk := primitive.Uint32(blkNums.next()) - indirectBlk.MarshalBytes(data[off:]) - off += indirectBlk.SizeBytes() - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...) - - // Write to double indirect block. - doublyIndirectBlk := primitive.Uint32(blkNums.next()) - doublyIndirectBlk.MarshalBytes(data[off:]) - off += doublyIndirectBlk.SizeBytes() - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...) - - // Write to triple indirect block. - triplyIndirectBlk := primitive.Uint32(blkNums.next()) - triplyIndirectBlk.MarshalBytes(data[off:]) - fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...) - - args := inodeArgs{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: getMockBMFileFize(), - }, - }, - blkSize: uint64(mockBMBlkSize), - } - copy(args.diskInode.Data(), data) - - mockFile, err := newBlockMapFile(args) - if err != nil { - t.Fatalf("newBlockMapFile failed: %v", err) - } - return mockFile, fileData -} - -// writeFileDataToBlock writes random bytes to the block on disk. -func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkNumGen) []byte { - if height == 0 { - start := blkNum * mockBMBlkSize - end := start + mockBMBlkSize - rand.Read(disk[start:end]) - return disk[start:end] - } - - var fileData []byte - for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 { - curBlkNum := primitive.Uint32(blkNums.next()) - curBlkNum.MarshalBytes(disk[off : off+4]) - fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...) - } - return fileData -} - -// getMockBMFileFize gets the size of the mock block map file which is used for -// testing. -func getMockBMFileFize() uint32 { - return uint32(numDirectBlks*getCoverage(uint64(mockBMBlkSize), 0) + getCoverage(uint64(mockBMBlkSize), 1) + getCoverage(uint64(mockBMBlkSize), 2) + getCoverage(uint64(mockBMBlkSize), 3)) -} diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go deleted file mode 100644 index 9bfed883a..000000000 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// dentry implements vfs.DentryImpl. -// -// +stateify savable -type dentry struct { - vfsd vfs.Dentry - - // Protected by filesystem.mu. - parent *dentry - name string - - // inode is the inode represented by this dentry. Multiple Dentries may - // share a single non-directory Inode (with hard links). inode is - // immutable. - inode *inode -} - -// Compiles only if dentry implements vfs.DentryImpl. -var _ vfs.DentryImpl = (*dentry)(nil) - -// newDentry is the dentry constructor. -func newDentry(in *inode) *dentry { - d := &dentry{ - inode: in, - } - d.vfsd.Init(d) - return d -} - -// IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef() { - d.inode.incRef() -} - -// TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef() bool { - return d.inode.tryIncRef() -} - -// DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef(ctx context.Context) { - // FIXME(b/134676337): filesystem.mu may not be locked as required by - // inode.decRef(). - d.inode.decRef() -} - -// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -// -// TODO(b/134676337): Implement inotify. -func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} - -// Watches implements vfs.DentryImpl.Watches. -// -// TODO(b/134676337): Implement inotify. -func (d *dentry) Watches() *vfs.Watches { - return nil -} - -// OnZeroWatches implements vfs.Dentry.OnZeroWatches. -// -// TODO(b/134676337): Implement inotify. -func (d *dentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go deleted file mode 100644 index 512b70ede..000000000 --- a/pkg/sentry/fsimpl/ext/directory.go +++ /dev/null @@ -1,312 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" -) - -// directory represents a directory inode. It holds the childList in memory. -// -// +stateify savable -type directory struct { - inode inode - - // childCache maps filenames to dentries for children for which dentries - // have been instantiated. childCache is protected by filesystem.mu. - childCache map[string]*dentry - - // mu serializes the changes to childList. - // Lock Order (outermost locks must be taken first): - // directory.mu - // filesystem.mu - mu sync.Mutex `state:"nosave"` - - // childList is a list containing (1) child dirents and (2) fake dirents - // (with diskDirent == nil) that represent the iteration position of - // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is protected by mu. - childList direntList - - // childMap maps the child's filename to the dirent structure stored in - // childList. This adds some data replication but helps in faster path - // traversal. For consistency, key == childMap[key].diskDirent.FileName(). - // Immutable. - childMap map[string]*dirent -} - -// newDirectory is the directory constructor. -func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { - file := &directory{ - childCache: make(map[string]*dentry), - childMap: make(map[string]*dirent), - } - file.inode.init(args, file) - - // Initialize childList by reading dirents from the underlying file. - if args.diskInode.Flags().Index { - // TODO(b/134676337): Support hash tree directories. Currently only the '.' - // and '..' entries are read in. - - // Users cannot navigate this hash tree directory yet. - log.Warningf("hash tree directory being used which is unsupported") - return file, nil - } - - // The dirents are organized in a linear array in the file data. - // Extract the file data and decode the dirents. - regFile, err := newRegularFile(args) - if err != nil { - return nil, err - } - - // buf is used as scratch space for reading in dirents from disk and - // unmarshalling them into dirent structs. - buf := make([]byte, disklayout.DirentSize) - size := args.diskInode.Size() - for off, inc := uint64(0), uint64(0); off < size; off += inc { - toRead := size - off - if toRead > disklayout.DirentSize { - toRead = disklayout.DirentSize - } - if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead { - return nil, err - } - - var curDirent dirent - if newDirent { - curDirent.diskDirent = &disklayout.DirentNew{} - } else { - curDirent.diskDirent = &disklayout.DirentOld{} - } - curDirent.diskDirent.UnmarshalBytes(buf) - - if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { - // Inode number and name length fields being set to 0 is used to indicate - // an unused dirent. - file.childList.PushBack(&curDirent) - file.childMap[curDirent.diskDirent.FileName()] = &curDirent - } - - // The next dirent is placed exactly after this dirent record on disk. - inc = uint64(curDirent.diskDirent.RecordSize()) - } - - return file, nil -} - -func (i *inode) isDir() bool { - _, ok := i.impl.(*directory) - return ok -} - -// dirent is the directory.childList node. -// -// +stateify savable -type dirent struct { - diskDirent disklayout.Dirent - - // direntEntry links dirents into their parent directory.childList. - direntEntry -} - -// directoryFD represents a directory file description. It implements -// vfs.FileDescriptionImpl. -// -// +stateify savable -type directoryFD struct { - fileDescription - vfs.DirectoryFileDescriptionDefaultImpl - - // Protected by directory.mu. - iter *dirent - off int64 -} - -// Compiles only if directoryFD implements vfs.FileDescriptionImpl. -var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release(ctx context.Context) { - if fd.iter == nil { - return - } - - dir := fd.inode().impl.(*directory) - dir.mu.Lock() - dir.childList.Remove(fd.iter) - dir.mu.Unlock() - fd.iter = nil -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - extfs := fd.filesystem() - dir := fd.inode().impl.(*directory) - - dir.mu.Lock() - defer dir.mu.Unlock() - - // Ensure that fd.iter exists and is not linked into dir.childList. - var child *dirent - if fd.iter == nil { - // Start iteration at the beginning of dir. - child = dir.childList.Front() - fd.iter = &dirent{} - } else { - // Continue iteration from where we left off. - child = fd.iter.Next() - dir.childList.Remove(fd.iter) - } - for ; child != nil; child = child.Next() { - // Skip other directoryFD iterators. - if child.diskDirent != nil { - childType, ok := child.diskDirent.FileType() - if !ok { - // We will need to read the inode off disk. Do not increment - // ref count here because this inode is not being added to the - // dentry tree. - extfs.mu.Lock() - childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode()) - extfs.mu.Unlock() - if err != nil { - // Usage of the file description after the error is - // undefined. This implementation would continue reading - // from the next dirent. - fd.off++ - dir.childList.InsertAfter(child, fd.iter) - return err - } - childType = fs.ToInodeType(childInode.diskInode.Mode().FileType()) - } - - if err := cb.Handle(vfs.Dirent{ - Name: child.diskDirent.FileName(), - Type: fs.ToDirentType(childType), - Ino: uint64(child.diskDirent.Inode()), - NextOff: fd.off + 1, - }); err != nil { - dir.childList.InsertBefore(child, fd.iter) - return err - } - fd.off++ - } - } - dir.childList.PushBack(fd.iter) - return nil -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - if whence != linux.SEEK_SET && whence != linux.SEEK_CUR { - return 0, syserror.EINVAL - } - - dir := fd.inode().impl.(*directory) - - dir.mu.Lock() - defer dir.mu.Unlock() - - // Find resulting offset. - if whence == linux.SEEK_CUR { - offset += fd.off - } - - if offset < 0 { - // lseek(2) specifies that EINVAL should be returned if the resulting offset - // is negative. - return 0, syserror.EINVAL - } - - n := int64(len(dir.childMap)) - realWantOff := offset - if realWantOff > n { - realWantOff = n - } - realCurOff := fd.off - if realCurOff > n { - realCurOff = n - } - - // Ensure that fd.iter exists and is linked into dir.childList so we can - // intelligently seek from the optimal position. - if fd.iter == nil { - fd.iter = &dirent{} - dir.childList.PushFront(fd.iter) - } - - // Guess that iterating from the current position is optimal. - child := fd.iter - diff := realWantOff - realCurOff // Shows direction and magnitude of travel. - - // See if starting from the beginning or end is better. - abDiff := diff - if diff < 0 { - abDiff = -diff - } - if abDiff > realWantOff { - // Starting from the beginning is best. - child = dir.childList.Front() - diff = realWantOff - } else if abDiff > (n - realWantOff) { - // Starting from the end is best. - child = dir.childList.Back() - // (n - 1) because the last non-nil dirent represents the (n-1)th offset. - diff = realWantOff - (n - 1) - } - - for child != nil { - // Skip other directoryFD iterators. - if child.diskDirent != nil { - if diff == 0 { - if child != fd.iter { - dir.childList.Remove(fd.iter) - dir.childList.InsertBefore(child, fd.iter) - } - - fd.off = offset - return offset, nil - } - - if diff < 0 { - diff++ - child = child.Prev() - } else { - diff-- - child = child.Next() - } - continue - } - - if diff < 0 { - child = child.Prev() - } else { - child = child.Next() - } - } - - // Reaching here indicates that the offset is beyond the end of the childList. - dir.childList.Remove(fd.iter) - dir.childList.PushBack(fd.iter) - fd.off = offset - return offset, nil -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD deleted file mode 100644 index d98a05dd8..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ /dev/null @@ -1,48 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "disklayout", - srcs = [ - "block_group.go", - "block_group_32.go", - "block_group_64.go", - "dirent.go", - "dirent_new.go", - "dirent_old.go", - "disklayout.go", - "extent.go", - "inode.go", - "inode_new.go", - "inode_old.go", - "superblock.go", - "superblock_32.go", - "superblock_64.go", - "superblock_old.go", - "test_utils.go", - ], - marshal = True, - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/marshal", - "//pkg/sentry/fs", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - ], -) - -go_test( - name = "disklayout_test", - size = "small", - srcs = [ - "block_group_test.go", - "dirent_test.go", - "extent_test.go", - "inode_test.go", - "superblock_test.go", - ], - library = ":disklayout", - deps = ["//pkg/sentry/kernel/time"], -) diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go deleted file mode 100644 index 0d56ae9da..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" -) - -// BlockGroup represents a Linux ext block group descriptor. An ext file system -// is split into a series of block groups. This provides an access layer to -// information needed to access and use a block group. -// -// Location: -// - The block group descriptor table is always placed in the blocks -// immediately after the block containing the superblock. -// - The 1st block group descriptor in the original table is in the -// (sb.FirstDataBlock() + 1)th block. -// - See SuperBlock docs to see where the block group descriptor table is -// replicated. -// - sb.BgDescSize() must be used as the block group descriptor entry size -// while reading the table from disk. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. -type BlockGroup interface { - marshal.Marshallable - - // InodeTable returns the absolute block number of the block containing the - // inode table. This points to an array of Inode structs. Inode tables are - // statically allocated at mkfs time. The superblock records the number of - // inodes per group (length of this table) and the size of each inode struct. - InodeTable() uint64 - - // BlockBitmap returns the absolute block number of the block containing the - // block bitmap. This bitmap tracks the usage of data blocks within this block - // group and has its own checksum. - BlockBitmap() uint64 - - // InodeBitmap returns the absolute block number of the block containing the - // inode bitmap. This bitmap tracks the usage of this group's inode table - // entries and has its own checksum. - InodeBitmap() uint64 - - // ExclusionBitmap returns the absolute block number of the snapshot exclusion - // bitmap. - ExclusionBitmap() uint64 - - // FreeBlocksCount returns the number of free blocks in the group. - FreeBlocksCount() uint32 - - // FreeInodesCount returns the number of free inodes in the group. - FreeInodesCount() uint32 - - // DirectoryCount returns the number of inodes that represent directories - // under this block group. - DirectoryCount() uint32 - - // UnusedInodeCount returns the number of unused inodes beyond the last used - // inode in this group's inode table. As a result, we needn’t scan past the - // (InodesPerGroup - UnusedInodeCount())th entry in the inode table. - UnusedInodeCount() uint32 - - // BlockBitmapChecksum returns the block bitmap checksum. This is calculated - // using crc32c(FS UUID + group number + entire bitmap). - BlockBitmapChecksum() uint32 - - // InodeBitmapChecksum returns the inode bitmap checksum. This is calculated - // using crc32c(FS UUID + group number + entire bitmap). - InodeBitmapChecksum() uint32 - - // Checksum returns this block group's checksum. - // - // If SbMetadataCsum feature is set: - // - checksum is crc32c(FS UUID + group number + group descriptor - // structure) & 0xFFFF. - // - // If SbGdtCsum feature is set: - // - checksum is crc16(FS UUID + group number + group descriptor - // structure). - // - // SbMetadataCsum and SbGdtCsum should not be both set. - // If they are, Linux warns and asks to run fsck. - Checksum() uint16 - - // Flags returns BGFlags which represents the block group flags. - Flags() BGFlags -} - -// These are the different block group flags. -const ( - // BgInodeUninit indicates that inode table and bitmap are not initialized. - BgInodeUninit uint16 = 0x1 - - // BgBlockUninit indicates that block bitmap is not initialized. - BgBlockUninit uint16 = 0x2 - - // BgInodeZeroed indicates that inode table is zeroed. - BgInodeZeroed uint16 = 0x4 -) - -// BGFlags represents all the different combinations of block group flags. -type BGFlags struct { - InodeUninit bool - BlockUninit bool - InodeZeroed bool -} - -// ToInt converts a BGFlags struct back to its 16-bit representation. -func (f BGFlags) ToInt() uint16 { - var res uint16 - - if f.InodeUninit { - res |= BgInodeUninit - } - if f.BlockUninit { - res |= BgBlockUninit - } - if f.InodeZeroed { - res |= BgInodeZeroed - } - - return res -} - -// BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct. -func BGFlagsFromInt(flags uint16) BGFlags { - return BGFlags{ - InodeUninit: flags&BgInodeUninit > 0, - BlockUninit: flags&BgBlockUninit > 0, - InodeZeroed: flags&BgInodeZeroed > 0, - } -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go deleted file mode 100644 index a35fa22a0..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// BlockGroup32Bit emulates the first half of struct ext4_group_desc in -// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and -// 32-bit ext4 filesystems. It implements BlockGroup interface. -// -// +marshal -type BlockGroup32Bit struct { - BlockBitmapLo uint32 - InodeBitmapLo uint32 - InodeTableLo uint32 - FreeBlocksCountLo uint16 - FreeInodesCountLo uint16 - UsedDirsCountLo uint16 - FlagsRaw uint16 - ExcludeBitmapLo uint32 - BlockBitmapChecksumLo uint16 - InodeBitmapChecksumLo uint16 - ItableUnusedLo uint16 - ChecksumRaw uint16 -} - -// Compiles only if BlockGroup32Bit implements BlockGroup. -var _ BlockGroup = (*BlockGroup32Bit)(nil) - -// InodeTable implements BlockGroup.InodeTable. -func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) } - -// BlockBitmap implements BlockGroup.BlockBitmap. -func (bg *BlockGroup32Bit) BlockBitmap() uint64 { return uint64(bg.BlockBitmapLo) } - -// InodeBitmap implements BlockGroup.InodeBitmap. -func (bg *BlockGroup32Bit) InodeBitmap() uint64 { return uint64(bg.InodeBitmapLo) } - -// ExclusionBitmap implements BlockGroup.ExclusionBitmap. -func (bg *BlockGroup32Bit) ExclusionBitmap() uint64 { return uint64(bg.ExcludeBitmapLo) } - -// FreeBlocksCount implements BlockGroup.FreeBlocksCount. -func (bg *BlockGroup32Bit) FreeBlocksCount() uint32 { return uint32(bg.FreeBlocksCountLo) } - -// FreeInodesCount implements BlockGroup.FreeInodesCount. -func (bg *BlockGroup32Bit) FreeInodesCount() uint32 { return uint32(bg.FreeInodesCountLo) } - -// DirectoryCount implements BlockGroup.DirectoryCount. -func (bg *BlockGroup32Bit) DirectoryCount() uint32 { return uint32(bg.UsedDirsCountLo) } - -// UnusedInodeCount implements BlockGroup.UnusedInodeCount. -func (bg *BlockGroup32Bit) UnusedInodeCount() uint32 { return uint32(bg.ItableUnusedLo) } - -// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. -func (bg *BlockGroup32Bit) BlockBitmapChecksum() uint32 { return uint32(bg.BlockBitmapChecksumLo) } - -// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. -func (bg *BlockGroup32Bit) InodeBitmapChecksum() uint32 { return uint32(bg.InodeBitmapChecksumLo) } - -// Checksum implements BlockGroup.Checksum. -func (bg *BlockGroup32Bit) Checksum() uint16 { return bg.ChecksumRaw } - -// Flags implements BlockGroup.Flags. -func (bg *BlockGroup32Bit) Flags() BGFlags { return BGFlagsFromInt(bg.FlagsRaw) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go deleted file mode 100644 index d54d1d345..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// BlockGroup64Bit emulates struct ext4_group_desc in fs/ext4/ext4.h. -// It is the block group descriptor struct for 64-bit ext4 filesystems. -// It implements BlockGroup interface. It is an extension of the 32-bit -// version of BlockGroup. -// -// +marshal -type BlockGroup64Bit struct { - // We embed the 32-bit struct here because 64-bit version is just an extension - // of the 32-bit version. - BlockGroup32Bit - - // 64-bit specific fields. - BlockBitmapHi uint32 - InodeBitmapHi uint32 - InodeTableHi uint32 - FreeBlocksCountHi uint16 - FreeInodesCountHi uint16 - UsedDirsCountHi uint16 - ItableUnusedHi uint16 - ExcludeBitmapHi uint32 - BlockBitmapChecksumHi uint16 - InodeBitmapChecksumHi uint16 - _ uint32 // Padding to 64 bytes. -} - -// Compiles only if BlockGroup64Bit implements BlockGroup. -var _ BlockGroup = (*BlockGroup64Bit)(nil) - -// Methods to override. Checksum() and Flags() are not overridden. - -// InodeTable implements BlockGroup.InodeTable. -func (bg *BlockGroup64Bit) InodeTable() uint64 { - return (uint64(bg.InodeTableHi) << 32) | uint64(bg.InodeTableLo) -} - -// BlockBitmap implements BlockGroup.BlockBitmap. -func (bg *BlockGroup64Bit) BlockBitmap() uint64 { - return (uint64(bg.BlockBitmapHi) << 32) | uint64(bg.BlockBitmapLo) -} - -// InodeBitmap implements BlockGroup.InodeBitmap. -func (bg *BlockGroup64Bit) InodeBitmap() uint64 { - return (uint64(bg.InodeBitmapHi) << 32) | uint64(bg.InodeBitmapLo) -} - -// ExclusionBitmap implements BlockGroup.ExclusionBitmap. -func (bg *BlockGroup64Bit) ExclusionBitmap() uint64 { - return (uint64(bg.ExcludeBitmapHi) << 32) | uint64(bg.ExcludeBitmapLo) -} - -// FreeBlocksCount implements BlockGroup.FreeBlocksCount. -func (bg *BlockGroup64Bit) FreeBlocksCount() uint32 { - return (uint32(bg.FreeBlocksCountHi) << 16) | uint32(bg.FreeBlocksCountLo) -} - -// FreeInodesCount implements BlockGroup.FreeInodesCount. -func (bg *BlockGroup64Bit) FreeInodesCount() uint32 { - return (uint32(bg.FreeInodesCountHi) << 16) | uint32(bg.FreeInodesCountLo) -} - -// DirectoryCount implements BlockGroup.DirectoryCount. -func (bg *BlockGroup64Bit) DirectoryCount() uint32 { - return (uint32(bg.UsedDirsCountHi) << 16) | uint32(bg.UsedDirsCountLo) -} - -// UnusedInodeCount implements BlockGroup.UnusedInodeCount. -func (bg *BlockGroup64Bit) UnusedInodeCount() uint32 { - return (uint32(bg.ItableUnusedHi) << 16) | uint32(bg.ItableUnusedLo) -} - -// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum. -func (bg *BlockGroup64Bit) BlockBitmapChecksum() uint32 { - return (uint32(bg.BlockBitmapChecksumHi) << 16) | uint32(bg.BlockBitmapChecksumLo) -} - -// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum. -func (bg *BlockGroup64Bit) InodeBitmapChecksum() uint32 { - return (uint32(bg.InodeBitmapChecksumHi) << 16) | uint32(bg.InodeBitmapChecksumLo) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go deleted file mode 100644 index 568c8cb4c..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -const ( - // MaxFileName is the maximum length of an ext fs file's name. - MaxFileName = 255 - - // DirentSize is the size of ext dirent structures. - DirentSize = 263 -) - -var ( - // inodeTypeByFileType maps ext4 file types to vfs inode types. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype. - inodeTypeByFileType = map[uint8]fs.InodeType{ - 0: fs.Anonymous, - 1: fs.RegularFile, - 2: fs.Directory, - 3: fs.CharacterDevice, - 4: fs.BlockDevice, - 5: fs.Pipe, - 6: fs.Socket, - 7: fs.Symlink, - } -) - -// The Dirent interface should be implemented by structs representing ext -// directory entries. These are for the linear classical directories which -// just store a list of dirent structs. A directory is a series of data blocks -// where is each data block contains a linear array of dirents. The last entry -// of the block has a record size that takes it to the end of the block. The -// end of the directory is when you read dirInode.Size() bytes from the blocks. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. -type Dirent interface { - marshal.Marshallable - - // Inode returns the absolute inode number of the underlying inode. - // Inode number 0 signifies an unused dirent. - Inode() uint32 - - // RecordSize returns the record length of this dirent on disk. The next - // dirent in the dirent list should be read after these many bytes from - // the current dirent. Must be a multiple of 4. - RecordSize() uint16 - - // FileName returns the name of the file. Can be at most 255 is length. - FileName() string - - // FileType returns the inode type of the underlying inode. This is a - // performance hack so that we do not have to read the underlying inode struct - // to know the type of inode. This will only work when the SbDirentFileType - // feature is set. If not, the second returned value will be false indicating - // that user code has to use the inode mode to extract the file type. - FileType() (fs.InodeType, bool) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go deleted file mode 100644 index 51f9c2946..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// DirentNew represents the ext4 directory entry struct. This emulates Linux's -// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we -// only need 8 bits to store the NameLength. As a result, NameLength has been -// shortened and the other 8 bits are used to encode the file type. Use the -// FileTypeRaw field only if the SbDirentFileType feature is set. -// -// Note: This struct can be of variable size on disk. The one described below -// is of maximum size and the FileName beyond NameLength bytes might contain -// garbage. -// -// +marshal -type DirentNew struct { - InodeNumber uint32 - RecordLength uint16 - NameLength uint8 - FileTypeRaw uint8 - FileNameRaw [MaxFileName]byte `marshal:"unaligned"` -} - -// Compiles only if DirentNew implements Dirent. -var _ Dirent = (*DirentNew)(nil) - -// Inode implements Dirent.Inode. -func (d *DirentNew) Inode() uint32 { return d.InodeNumber } - -// RecordSize implements Dirent.RecordSize. -func (d *DirentNew) RecordSize() uint16 { return d.RecordLength } - -// FileName implements Dirent.FileName. -func (d *DirentNew) FileName() string { - return string(d.FileNameRaw[:d.NameLength]) -} - -// FileType implements Dirent.FileType. -func (d *DirentNew) FileType() (fs.InodeType, bool) { - if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok { - return inodeType, true - } - - panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw)) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go deleted file mode 100644 index d4b19e086..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import "gvisor.dev/gvisor/pkg/sentry/fs" - -// DirentOld represents the old directory entry struct which does not contain -// the file type. This emulates Linux's ext4_dir_entry struct. -// -// Note: This struct can be of variable size on disk. The one described below -// is of maximum size and the FileName beyond NameLength bytes might contain -// garbage. -// -// +marshal -type DirentOld struct { - InodeNumber uint32 - RecordLength uint16 - NameLength uint16 - FileNameRaw [MaxFileName]byte `marshal:"unaligned"` -} - -// Compiles only if DirentOld implements Dirent. -var _ Dirent = (*DirentOld)(nil) - -// Inode implements Dirent.Inode. -func (d *DirentOld) Inode() uint32 { return d.InodeNumber } - -// RecordSize implements Dirent.RecordSize. -func (d *DirentOld) RecordSize() uint16 { return d.RecordLength } - -// FileName implements Dirent.FileName. -func (d *DirentOld) FileName() string { - return string(d.FileNameRaw[:d.NameLength]) -} - -// FileType implements Dirent.FileType. -func (d *DirentOld) FileType() (fs.InodeType, bool) { - return fs.Anonymous, false -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go deleted file mode 100644 index 0834e9ba8..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package disklayout provides Linux ext file system's disk level structures -// which can be directly read into from the underlying device. Structs aim to -// emulate structures `exactly` how they are layed out on disk. -// -// This library aims to be compatible with all ext(2/3/4) systems so it -// provides a generic interface for all major structures and various -// implementations (for different versions). The user code is responsible for -// using appropriate implementations based on the underlying device. -// -// Interfacing all major structures here serves a few purposes: -// - Abstracts away the complexity of the underlying structure from client -// code. The client only has to figure out versioning on set up and then -// can use these as black boxes and pass it higher up the stack. -// - Having pointer receivers forces the user to use pointers to these -// heavy structs. Hence, prevents the client code from unintentionally -// copying these by value while passing the interface around. -// - Version-based implementation selection is resolved on set up hence -// avoiding per call overhead of choosing implementation. -// - All interface methods are pretty light weight (do not take in any -// parameters by design). Passing pointer arguments to interface methods -// can lead to heap allocation as the compiler won't be able to perform -// escape analysis on an unknown implementation at compile time. -// -// Notes: -// - All structures on disk are in little-endian order. Only jbd2 (journal) -// structures are in big-endian order. -// - All OS dependent fields in these structures will be interpretted using -// the Linux version of that field. -// - The suffix `Lo` in field names stands for lower bits of that field. -// - The suffix `Hi` in field names stands for upper bits of that field. -// - The suffix `Raw` has been added to indicate that the field is not split -// into Lo and Hi fields and also to resolve name collision with the -// respective interface. -package disklayout diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go deleted file mode 100644 index b13999bfc..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/extent.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" -) - -// Extents were introduced in ext4 and provide huge performance gains in terms -// data locality and reduced metadata block usage. Extents are organized in -// extent trees. The root node is contained in inode.BlocksRaw. -// -// Terminology: -// - Physical Block: -// Filesystem data block which is addressed normally wrt the entire -// filesystem (addressed with 48 bits). -// -// - File Block: -// Data block containing *only* file data and addressed wrt to the file -// with only 32 bits. The (i)th file block contains file data from -// byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()). - -const ( - // ExtentHeaderSize is the size of the header of an extent tree node. - ExtentHeaderSize = 12 - - // ExtentEntrySize is the size of an entry in an extent tree node. - // This size is the same for both leaf and internal nodes. - ExtentEntrySize = 12 - - // ExtentMagic is the magic number which must be present in the header. - ExtentMagic = 0xf30a -) - -// ExtentEntryPair couples an in-memory ExtendNode with the ExtentEntry that -// points to it. We want to cache these structs in memory to avoid repeated -// disk reads. -// -// Note: This struct itself does not represent an on-disk struct. -type ExtentEntryPair struct { - // Entry points to the child node on disk. - Entry ExtentEntry - // Node points to child node in memory. Is nil if the current node is a leaf. - Node *ExtentNode -} - -// ExtentNode represents an extent tree node. For internal nodes, all Entries -// will be ExtendIdxs. For leaf nodes, they will all be Extents. -// -// Note: This struct itself does not represent an on-disk struct. -type ExtentNode struct { - Header ExtentHeader - Entries []ExtentEntryPair -} - -// ExtentEntry represents an extent tree node entry. The entry can either be -// an ExtentIdx or Extent itself. This exists to simplify navigation logic. -type ExtentEntry interface { - marshal.Marshallable - - // FileBlock returns the first file block number covered by this entry. - FileBlock() uint32 - - // PhysicalBlock returns the child physical block that this entry points to. - PhysicalBlock() uint64 -} - -// ExtentHeader emulates the ext4_extent_header struct in ext4. Each extent -// tree node begins with this and is followed by `NumEntries` number of: -// - Extent if `Depth` == 0 -// - ExtentIdx otherwise -// -// +marshal -type ExtentHeader struct { - // Magic in the extent magic number, must be 0xf30a. - Magic uint16 - - // NumEntries indicates the number of valid entries following the header. - NumEntries uint16 - - // MaxEntries that could follow the header. Used while adding entries. - MaxEntries uint16 - - // Height represents the distance of this node from the farthest leaf. Please - // note that Linux incorrectly calls this `Depth` (which means the distance - // of the node from the root). - Height uint16 - _ uint32 -} - -// ExtentIdx emulates the ext4_extent_idx struct in ext4. Only present in -// internal nodes. Sorted in ascending order based on FirstFileBlock since -// Linux does a binary search on this. This points to a block containing the -// child node. -// -// +marshal -type ExtentIdx struct { - FirstFileBlock uint32 - ChildBlockLo uint32 - ChildBlockHi uint16 - _ uint16 -} - -// Compiles only if ExtentIdx implements ExtentEntry. -var _ ExtentEntry = (*ExtentIdx)(nil) - -// FileBlock implements ExtentEntry.FileBlock. -func (ei *ExtentIdx) FileBlock() uint32 { - return ei.FirstFileBlock -} - -// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the -// physical block number of the child block. -func (ei *ExtentIdx) PhysicalBlock() uint64 { - return (uint64(ei.ChildBlockHi) << 32) | uint64(ei.ChildBlockLo) -} - -// Extent represents the ext4_extent struct in ext4. Only present in leaf -// nodes. Sorted in ascending order based on FirstFileBlock since Linux does a -// binary search on this. This points to an array of data blocks containing the -// file data. It covers `Length` data blocks starting from `StartBlock`. -// -// +marshal -type Extent struct { - FirstFileBlock uint32 - Length uint16 - StartBlockHi uint16 - StartBlockLo uint32 -} - -// Compiles only if Extent implements ExtentEntry. -var _ ExtentEntry = (*Extent)(nil) - -// FileBlock implements ExtentEntry.FileBlock. -func (e *Extent) FileBlock() uint32 { - return e.FirstFileBlock -} - -// PhysicalBlock implements ExtentEntry.PhysicalBlock. It returns the -// physical block number of the first data block this extent covers. -func (e *Extent) PhysicalBlock() uint64 { - return (uint64(e.StartBlockHi) << 32) | uint64(e.StartBlockLo) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go deleted file mode 100644 index ef25040a9..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode.go +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/marshal" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -// Special inodes. See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#special-inodes. -const ( - // RootDirInode is the inode number of the root directory inode. - RootDirInode = 2 -) - -// The Inode interface must be implemented by structs representing ext inodes. -// The inode stores all the metadata pertaining to the file (except for the -// file name which is held by the directory entry). It does NOT expose all -// fields and should be extended if need be. -// -// Some file systems (e.g. FAT) use the directory entry to store all this -// information. Ext file systems do not so that they can support hard links. -// However, ext4 cheats a little bit and duplicates the file type in the -// directory entry for performance gains. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. -type Inode interface { - marshal.Marshallable - - // Mode returns the linux file mode which is majorly used to extract - // information like: - // - File permissions (read/write/execute by user/group/others). - // - Sticky, set UID and GID bits. - // - File type. - // - // Masks to extract this information are provided in pkg/abi/linux/file.go. - Mode() linux.FileMode - - // UID returns the owner UID. - UID() auth.KUID - - // GID returns the owner GID. - GID() auth.KGID - - // Size returns the size of the file in bytes. - Size() uint64 - - // InodeSize returns the size of this inode struct in bytes. - // In ext2 and ext3, the inode struct and inode disk record size was fixed at - // 128 bytes. Ext4 makes it possible for the inode struct to be bigger. - // However, accessing any field beyond the 128 bytes marker must be verified - // using this method. - InodeSize() uint16 - - // AccessTime returns the last access time. Shows when the file was last read. - // - // If InExtendedAttr is set, then this should NOT be used because the - // underlying field is used to store the extended attribute value checksum. - AccessTime() time.Time - - // ChangeTime returns the last change time. Shows when the file meta data - // (like permissions) was last changed. - // - // If InExtendedAttr is set, then this should NOT be used because the - // underlying field is used to store the lower 32 bits of the attribute - // value’s reference count. - ChangeTime() time.Time - - // ModificationTime returns the last modification time. Shows when the file - // content was last modified. - // - // If InExtendedAttr is set, then this should NOT be used because - // the underlying field contains the number of the inode that owns the - // extended attribute. - ModificationTime() time.Time - - // DeletionTime returns the deletion time. Inodes are marked as deleted by - // writing to the underlying field. FS tools can restore files until they are - // actually overwritten. - DeletionTime() time.Time - - // LinksCount returns the number of hard links to this inode. - // - // Normally there is an upper limit on the number of hard links: - // - ext2/ext3 = 32,000 - // - ext4 = 65,000 - // - // This implies that an ext4 directory cannot have more than 64,998 - // subdirectories because each subdirectory will have a hard link to the - // directory via the `..` entry. The directory has hard link via the `.` entry - // of its own. And finally the inode is initiated with 1 hard link (itself). - // - // The underlying value is reset to 1 if all the following hold: - // - Inode is a directory. - // - SbDirNlink is enabled. - // - Number of hard links is incremented past 64,999. - // Hard link value of 1 for a directory would indicate that the number of hard - // links is unknown because a directory can have minimum 2 hard links (itself - // and `.` entry). - LinksCount() uint16 - - // Flags returns InodeFlags which represents the inode flags. - Flags() InodeFlags - - // Data returns the underlying inode.i_block array as a slice so it's - // modifiable. This field is special and is used to store various kinds of - // things depending on the filesystem version and inode type. The underlying - // field name in Linux is a little misleading. - // - In ext2/ext3, it contains the block map. - // - In ext4, it contains the extent tree root node. - // - For inline files, it contains the file contents. - // - For symlinks, it contains the link path (if it fits here). - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. - Data() []byte -} - -// Inode flags. This is not comprehensive and flags which were not used in -// the Linux kernel have been excluded. -const ( - // InSync indicates that all writes to the file must be synchronous. - InSync = 0x8 - - // InImmutable indicates that this file is immutable. - InImmutable = 0x10 - - // InAppend indicates that this file can only be appended to. - InAppend = 0x20 - - // InNoDump indicates that teh dump(1) utility should not dump this file. - InNoDump = 0x40 - - // InNoAccessTime indicates that the access time of this inode must not be - // updated. - InNoAccessTime = 0x80 - - // InIndex indicates that this directory has hashed indexes. - InIndex = 0x1000 - - // InJournalData indicates that file data must always be written through a - // journal device. - InJournalData = 0x4000 - - // InDirSync indicates that all the directory entiry data must be written - // synchronously. - InDirSync = 0x10000 - - // InTopDir indicates that this inode is at the top of the directory hierarchy. - InTopDir = 0x20000 - - // InHugeFile indicates that this is a huge file. - InHugeFile = 0x40000 - - // InExtents indicates that this inode uses extents. - InExtents = 0x80000 - - // InExtendedAttr indicates that this inode stores a large extended attribute - // value in its data blocks. - InExtendedAttr = 0x200000 - - // InInline indicates that this inode has inline data. - InInline = 0x10000000 - - // InReserved indicates that this inode is reserved for the ext4 library. - InReserved = 0x80000000 -) - -// InodeFlags represents all possible combinations of inode flags. It aims to -// cover the bit masks and provide a more user-friendly interface. -type InodeFlags struct { - Sync bool - Immutable bool - Append bool - NoDump bool - NoAccessTime bool - Index bool - JournalData bool - DirSync bool - TopDir bool - HugeFile bool - Extents bool - ExtendedAttr bool - Inline bool - Reserved bool -} - -// ToInt converts inode flags back to its 32-bit rep. -func (f InodeFlags) ToInt() uint32 { - var res uint32 - - if f.Sync { - res |= InSync - } - if f.Immutable { - res |= InImmutable - } - if f.Append { - res |= InAppend - } - if f.NoDump { - res |= InNoDump - } - if f.NoAccessTime { - res |= InNoAccessTime - } - if f.Index { - res |= InIndex - } - if f.JournalData { - res |= InJournalData - } - if f.DirSync { - res |= InDirSync - } - if f.TopDir { - res |= InTopDir - } - if f.HugeFile { - res |= InHugeFile - } - if f.Extents { - res |= InExtents - } - if f.ExtendedAttr { - res |= InExtendedAttr - } - if f.Inline { - res |= InInline - } - if f.Reserved { - res |= InReserved - } - - return res -} - -// InodeFlagsFromInt converts the integer representation of inode flags to -// a InodeFlags struct. -func InodeFlagsFromInt(f uint32) InodeFlags { - return InodeFlags{ - Sync: f&InSync > 0, - Immutable: f&InImmutable > 0, - Append: f&InAppend > 0, - NoDump: f&InNoDump > 0, - NoAccessTime: f&InNoAccessTime > 0, - Index: f&InIndex > 0, - JournalData: f&InJournalData > 0, - DirSync: f&InDirSync > 0, - TopDir: f&InTopDir > 0, - HugeFile: f&InHugeFile > 0, - Extents: f&InExtents > 0, - ExtendedAttr: f&InExtendedAttr > 0, - Inline: f&InInline > 0, - Reserved: f&InReserved > 0, - } -} - -// These masks define how users can view/modify inode flags. The rest of the -// flags are for internal kernel usage only. -const ( - InUserReadFlagMask = 0x4BDFFF - InUserWriteFlagMask = 0x4B80FF -) diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go deleted file mode 100644 index a4503f5cf..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import "gvisor.dev/gvisor/pkg/sentry/kernel/time" - -// InodeNew represents ext4 inode structure which can be bigger than -// OldInodeSize. The actual size of this struct should be determined using -// inode.ExtraInodeSize. Accessing any field here should be verified with the -// actual size. The extra space between the end of the inode struct and end of -// the inode record can be used to store extended attr. -// -// If the TimeExtra fields are in scope, the lower 2 bits of those are used -// to extend their counter part to be 34 bits wide; the rest (upper) 30 bits -// are used to provide nanoscond precision. Hence, these timestamps will now -// overflow in May 2446. -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. -// -// +marshal -type InodeNew struct { - InodeOld - - ExtraInodeSize uint16 - ChecksumHi uint16 - ChangeTimeExtra uint32 - ModificationTimeExtra uint32 - AccessTimeExtra uint32 - CreationTime uint32 - CreationTimeExtra uint32 - VersionHi uint32 - ProjectID uint32 -} - -// Compiles only if InodeNew implements Inode. -var _ Inode = (*InodeNew)(nil) - -// fromExtraTime decodes the extra time and constructs the kernel time struct -// with nanosecond precision. -func fromExtraTime(lo int32, extra uint32) time.Time { - // See description above InodeNew for format. - seconds := (int64(extra&0x3) << 32) + int64(lo) - nanoseconds := int64(extra >> 2) - return time.FromUnix(seconds, nanoseconds) -} - -// Only override methods which change due to ext4 specific fields. - -// Size implements Inode.Size. -func (in *InodeNew) Size() uint64 { - return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo) -} - -// InodeSize implements Inode.InodeSize. -func (in *InodeNew) InodeSize() uint16 { - return OldInodeSize + in.ExtraInodeSize -} - -// ChangeTime implements Inode.ChangeTime. -func (in *InodeNew) ChangeTime() time.Time { - // Apply new timestamp logic if inode.ChangeTimeExtra is in scope. - if in.ExtraInodeSize >= 8 { - return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra) - } - - return in.InodeOld.ChangeTime() -} - -// ModificationTime implements Inode.ModificationTime. -func (in *InodeNew) ModificationTime() time.Time { - // Apply new timestamp logic if inode.ModificationTimeExtra is in scope. - if in.ExtraInodeSize >= 12 { - return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra) - } - - return in.InodeOld.ModificationTime() -} - -// AccessTime implements Inode.AccessTime. -func (in *InodeNew) AccessTime() time.Time { - // Apply new timestamp logic if inode.AccessTimeExtra is in scope. - if in.ExtraInodeSize >= 16 { - return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra) - } - - return in.InodeOld.AccessTime() -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go deleted file mode 100644 index e6b28babf..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -const ( - // OldInodeSize is the inode size in ext2/ext3. - OldInodeSize = 128 -) - -// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. -// Inode struct size and record size are both 128 bytes for this. -// -// All fields representing time are in seconds since the epoch. Which means that -// they will overflow in January 2038. -// -// +marshal -type InodeOld struct { - ModeRaw uint16 - UIDLo uint16 - SizeLo uint32 - - // The time fields are signed integers because they could be negative to - // represent time before the epoch. - AccessTimeRaw int32 - ChangeTimeRaw int32 - ModificationTimeRaw int32 - DeletionTimeRaw int32 - - GIDLo uint16 - LinksCountRaw uint16 - BlocksCountLo uint32 - FlagsRaw uint32 - VersionLo uint32 // This is OS dependent. - DataRaw [60]byte - Generation uint32 - FileACLLo uint32 - SizeHi uint32 - ObsoFaddr uint32 - - // OS dependent fields have been inlined here. - BlocksCountHi uint16 - FileACLHi uint16 - UIDHi uint16 - GIDHi uint16 - ChecksumLo uint16 - _ uint16 -} - -// Compiles only if InodeOld implements Inode. -var _ Inode = (*InodeOld)(nil) - -// Mode implements Inode.Mode. -func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) } - -// UID implements Inode.UID. -func (in *InodeOld) UID() auth.KUID { - return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo)) -} - -// GID implements Inode.GID. -func (in *InodeOld) GID() auth.KGID { - return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo)) -} - -// Size implements Inode.Size. -func (in *InodeOld) Size() uint64 { - // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL. - return uint64(in.SizeLo) -} - -// InodeSize implements Inode.InodeSize. -func (in *InodeOld) InodeSize() uint16 { return OldInodeSize } - -// AccessTime implements Inode.AccessTime. -func (in *InodeOld) AccessTime() time.Time { - return time.FromUnix(int64(in.AccessTimeRaw), 0) -} - -// ChangeTime implements Inode.ChangeTime. -func (in *InodeOld) ChangeTime() time.Time { - return time.FromUnix(int64(in.ChangeTimeRaw), 0) -} - -// ModificationTime implements Inode.ModificationTime. -func (in *InodeOld) ModificationTime() time.Time { - return time.FromUnix(int64(in.ModificationTimeRaw), 0) -} - -// DeletionTime implements Inode.DeletionTime. -func (in *InodeOld) DeletionTime() time.Time { - return time.FromUnix(int64(in.DeletionTimeRaw), 0) -} - -// LinksCount implements Inode.LinksCount. -func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } - -// Flags implements Inode.Flags. -func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } - -// Data implements Inode.Data. -func (in *InodeOld) Data() []byte { return in.DataRaw[:] } diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go deleted file mode 100644 index 90744e956..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "fmt" - "strconv" - "testing" - - "gvisor.dev/gvisor/pkg/sentry/kernel/time" -) - -// TestInodeSize tests that the inode structs are of the correct size. -func TestInodeSize(t *testing.T) { - var iOld InodeOld - assertSize(t, &iOld, OldInodeSize) - - // This was updated from 156 bytes to 160 bytes in Oct 2015. - var iNew InodeNew - assertSize(t, &iNew, 160) -} - -// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in -// ext4 inode structs are decoded correctly. -// -// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. -func TestTimestampSeconds(t *testing.T) { - type timestampTest struct { - // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set. - // If this is set then the 32-bit time is negative. - msbSet bool - - // lowerBound tells if we should take the lowest possible value of - // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to - // false it tells to take the highest possible value. - lowerBound bool - - // extraBits is InodeNew.[X]TimeExtra. - extraBits uint32 - - // want is the kernel time struct that is expected. - want time.Time - } - - tests := []timestampTest{ - // 1901-12-13 - { - msbSet: true, - lowerBound: true, - extraBits: 0, - want: time.FromUnix(int64(-0x80000000), 0), - }, - - // 1969-12-31 - { - msbSet: true, - lowerBound: false, - extraBits: 0, - want: time.FromUnix(int64(-1), 0), - }, - - // 1970-01-01 - { - msbSet: false, - lowerBound: true, - extraBits: 0, - want: time.FromUnix(int64(0), 0), - }, - - // 2038-01-19 - { - msbSet: false, - lowerBound: false, - extraBits: 0, - want: time.FromUnix(int64(0x7fffffff), 0), - }, - - // 2038-01-19 - { - msbSet: true, - lowerBound: true, - extraBits: 1, - want: time.FromUnix(int64(0x80000000), 0), - }, - - // 2106-02-07 - { - msbSet: true, - lowerBound: false, - extraBits: 1, - want: time.FromUnix(int64(0xffffffff), 0), - }, - - // 2106-02-07 - { - msbSet: false, - lowerBound: true, - extraBits: 1, - want: time.FromUnix(int64(0x100000000), 0), - }, - - // 2174-02-25 - { - msbSet: false, - lowerBound: false, - extraBits: 1, - want: time.FromUnix(int64(0x17fffffff), 0), - }, - - // 2174-02-25 - { - msbSet: true, - lowerBound: true, - extraBits: 2, - want: time.FromUnix(int64(0x180000000), 0), - }, - - // 2242-03-16 - { - msbSet: true, - lowerBound: false, - extraBits: 2, - want: time.FromUnix(int64(0x1ffffffff), 0), - }, - - // 2242-03-16 - { - msbSet: false, - lowerBound: true, - extraBits: 2, - want: time.FromUnix(int64(0x200000000), 0), - }, - - // 2310-04-04 - { - msbSet: false, - lowerBound: false, - extraBits: 2, - want: time.FromUnix(int64(0x27fffffff), 0), - }, - - // 2310-04-04 - { - msbSet: true, - lowerBound: true, - extraBits: 3, - want: time.FromUnix(int64(0x280000000), 0), - }, - - // 2378-04-22 - { - msbSet: true, - lowerBound: false, - extraBits: 3, - want: time.FromUnix(int64(0x2ffffffff), 0), - }, - - // 2378-04-22 - { - msbSet: false, - lowerBound: true, - extraBits: 3, - want: time.FromUnix(int64(0x300000000), 0), - }, - - // 2446-05-10 - { - msbSet: false, - lowerBound: false, - extraBits: 3, - want: time.FromUnix(int64(0x37fffffff), 0), - }, - } - - lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000 - upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111 - lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000 - upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111 - - get32BitTime := func(test timestampTest) int32 { - if test.msbSet { - if test.lowerBound { - return lowerMSB1 - } - - return upperMSB1 - } - - if test.lowerBound { - return lowerMSB0 - } - - return upperMSB0 - } - - getTestName := func(test timestampTest) string { - return fmt.Sprintf( - "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t", - strconv.FormatInt(int64(test.extraBits), 2), - test.msbSet, - test.lowerBound, - ) - } - - for _, test := range tests { - t.Run(getTestName(test), func(t *testing.T) { - if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want { - t.Errorf("Expected: %v, Got: %v", test.want, got) - } - }) - } -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go deleted file mode 100644 index 70948ebe9..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go +++ /dev/null @@ -1,477 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -import ( - "gvisor.dev/gvisor/pkg/marshal" -) - -const ( - // SbOffset is the absolute offset at which the superblock is placed. - SbOffset = 1024 -) - -// SuperBlock should be implemented by structs representing the ext superblock. -// The superblock holds a lot of information about the enclosing filesystem. -// This interface aims to provide access methods to important information held -// by the superblock. It does NOT expose all fields of the superblock, only the -// ones necessary. This can be expanded when need be. -// -// Location and replication: -// - The superblock is located at offset 1024 in block group 0. -// - Redundant copies of the superblock and group descriptors are kept in -// all groups if SbSparse feature flag is NOT set. If it is set, the -// replicas only exist in groups whose group number is either 0 or a -// power of 3, 5, or 7. -// - There is also a sparse superblock feature v2 in which there are just -// two replicas saved in the block groups pointed by sb.s_backup_bgs. -// -// Replicas should eventually be updated if the superblock is updated. -// -// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. -type SuperBlock interface { - marshal.Marshallable - - // InodesCount returns the total number of inodes in this filesystem. - InodesCount() uint32 - - // BlocksCount returns the total number of data blocks in this filesystem. - BlocksCount() uint64 - - // FreeBlocksCount returns the number of free blocks in this filesystem. - FreeBlocksCount() uint64 - - // FreeInodesCount returns the number of free inodes in this filesystem. - FreeInodesCount() uint32 - - // MountCount returns the number of mounts since the last fsck. - MountCount() uint16 - - // MaxMountCount returns the number of mounts allowed beyond which a fsck is - // needed. - MaxMountCount() uint16 - - // FirstDataBlock returns the absolute block number of the first data block, - // which contains the super block itself. - // - // If the filesystem has 1kb data blocks then this should return 1. For all - // other configurations, this typically returns 0. - FirstDataBlock() uint32 - - // BlockSize returns the size of one data block in this filesystem. - // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that - // the smallest block size is 1kb. - BlockSize() uint64 - - // BlocksPerGroup returns the number of data blocks in a block group. - BlocksPerGroup() uint32 - - // ClusterSize returns block cluster size (set during mkfs time by admin). - // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that - // the smallest cluster size is 1kb. - // - // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature - // is NOT set and consequently BlockSize() = ClusterSize() in that case. - ClusterSize() uint64 - - // ClustersPerGroup returns: - // - number of clusters per group if bigalloc is enabled. - // - BlocksPerGroup() otherwise. - ClustersPerGroup() uint32 - - // InodeSize returns the size of the inode disk record size in bytes. Use this - // to iterate over inode arrays on disk. - // - // In ext2 and ext3: - // - Each inode had a disk record of 128 bytes. - // - The inode struct size was fixed at 128 bytes. - // - // In ext4 its possible to allocate larger on-disk inodes: - // - Inode disk record size = sb.s_inode_size (function return value). - // = 256 (default) - // - Inode struct size = 128 + inode.i_extra_isize. - // = 128 + 32 = 160 (default) - InodeSize() uint16 - - // InodesPerGroup returns the number of inodes in a block group. - InodesPerGroup() uint32 - - // BgDescSize returns the size of the block group descriptor struct. - // - // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor - // is only 32 bytes long. - // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST - // 64 bytes. It might be bigger than that. - BgDescSize() uint16 - - // CompatibleFeatures returns the CompatFeatures struct which holds all the - // compatible features this fs supports. - CompatibleFeatures() CompatFeatures - - // IncompatibleFeatures returns the CompatFeatures struct which holds all the - // incompatible features this fs supports. - IncompatibleFeatures() IncompatFeatures - - // ReadOnlyCompatibleFeatures returns the CompatFeatures struct which holds all the - // readonly compatible features this fs supports. - ReadOnlyCompatibleFeatures() RoCompatFeatures - - // Magic() returns the magic signature which must be 0xef53. - Magic() uint16 - - // Revision returns the superblock revision. Superblock struct fields from - // offset 0x54 till 0x150 should only be used if superblock has DynamicRev. - Revision() SbRevision -} - -// SbRevision is the type for superblock revisions. -type SbRevision uint32 - -// Super block revisions. -const ( - // OldRev is the good old (original) format. - OldRev SbRevision = 0 - - // DynamicRev is v2 format w/ dynamic inode sizes. - DynamicRev SbRevision = 1 -) - -// Superblock compatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbDirPrealloc indicates directory preallocation. - SbDirPrealloc = 0x1 - - // SbHasJournal indicates the presence of a journal. jbd2 should only work - // with this being set. - SbHasJournal = 0x4 - - // SbExtAttr indicates extended attributes support. - SbExtAttr = 0x8 - - // SbResizeInode indicates that the fs has reserved GDT blocks (right after - // group descriptors) for fs expansion. - SbResizeInode = 0x10 - - // SbDirIndex indicates that the fs has directory indices. - SbDirIndex = 0x20 - - // SbSparseV2 stands for Sparse superblock version 2. - SbSparseV2 = 0x200 -) - -// CompatFeatures represents a superblock's compatible feature set. If the -// kernel does not understand any of these feature, it can still read/write -// to this fs. -type CompatFeatures struct { - DirPrealloc bool - HasJournal bool - ExtAttr bool - ResizeInode bool - DirIndex bool - SparseV2 bool -} - -// ToInt converts superblock compatible features back to its 32-bit rep. -func (f CompatFeatures) ToInt() uint32 { - var res uint32 - - if f.DirPrealloc { - res |= SbDirPrealloc - } - if f.HasJournal { - res |= SbHasJournal - } - if f.ExtAttr { - res |= SbExtAttr - } - if f.ResizeInode { - res |= SbResizeInode - } - if f.DirIndex { - res |= SbDirIndex - } - if f.SparseV2 { - res |= SbSparseV2 - } - - return res -} - -// CompatFeaturesFromInt converts the integer representation of superblock -// compatible features to CompatFeatures struct. -func CompatFeaturesFromInt(f uint32) CompatFeatures { - return CompatFeatures{ - DirPrealloc: f&SbDirPrealloc > 0, - HasJournal: f&SbHasJournal > 0, - ExtAttr: f&SbExtAttr > 0, - ResizeInode: f&SbResizeInode > 0, - DirIndex: f&SbDirIndex > 0, - SparseV2: f&SbSparseV2 > 0, - } -} - -// Superblock incompatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbDirentFileType indicates that directory entries record the file type. - // We should use struct DirentNew for dirents then. - SbDirentFileType = 0x2 - - // SbRecovery indicates that the filesystem needs recovery. - SbRecovery = 0x4 - - // SbJournalDev indicates that the filesystem has a separate journal device. - SbJournalDev = 0x8 - - // SbMetaBG indicates that the filesystem is using Meta block groups. Moves - // the group descriptors from the congested first block group into the first - // group of each metablock group to increase the maximum block groups limit - // and hence support much larger filesystems. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups. - SbMetaBG = 0x10 - - // SbExtents indicates that the filesystem uses extents. Must be set in ext4 - // filesystems. - SbExtents = 0x40 - - // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits. - // Hence can support 2^64 data blocks. - SbIs64Bit = 0x80 - - // SbMMP indicates that this filesystem has multiple mount protection. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection. - SbMMP = 0x100 - - // SbFlexBg indicates that this filesystem has flexible block groups. Several - // block groups are tied into one logical block group so that all the metadata - // for the block groups (bitmaps and inode tables) are close together for - // faster loading. Consequently, large files will be continuous on disk. - // However, this does not affect the placement of redundant superblocks and - // group descriptors. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups. - SbFlexBg = 0x200 - - // SbLargeDir shows that large directory enabled. Directory htree can be 3 - // levels deep. Directory htrees are allowed to be 2 levels deep otherwise. - SbLargeDir = 0x4000 - - // SbInlineData allows inline data in inodes for really small files. - SbInlineData = 0x8000 - - // SbEncrypted indicates that this fs contains encrypted inodes. - SbEncrypted = 0x10000 -) - -// IncompatFeatures represents a superblock's incompatible feature set. If the -// kernel does not understand any of these feature, it should refuse to mount. -type IncompatFeatures struct { - DirentFileType bool - Recovery bool - JournalDev bool - MetaBG bool - Extents bool - Is64Bit bool - MMP bool - FlexBg bool - LargeDir bool - InlineData bool - Encrypted bool -} - -// ToInt converts superblock incompatible features back to its 32-bit rep. -func (f IncompatFeatures) ToInt() uint32 { - var res uint32 - - if f.DirentFileType { - res |= SbDirentFileType - } - if f.Recovery { - res |= SbRecovery - } - if f.JournalDev { - res |= SbJournalDev - } - if f.MetaBG { - res |= SbMetaBG - } - if f.Extents { - res |= SbExtents - } - if f.Is64Bit { - res |= SbIs64Bit - } - if f.MMP { - res |= SbMMP - } - if f.FlexBg { - res |= SbFlexBg - } - if f.LargeDir { - res |= SbLargeDir - } - if f.InlineData { - res |= SbInlineData - } - if f.Encrypted { - res |= SbEncrypted - } - - return res -} - -// IncompatFeaturesFromInt converts the integer representation of superblock -// incompatible features to IncompatFeatures struct. -func IncompatFeaturesFromInt(f uint32) IncompatFeatures { - return IncompatFeatures{ - DirentFileType: f&SbDirentFileType > 0, - Recovery: f&SbRecovery > 0, - JournalDev: f&SbJournalDev > 0, - MetaBG: f&SbMetaBG > 0, - Extents: f&SbExtents > 0, - Is64Bit: f&SbIs64Bit > 0, - MMP: f&SbMMP > 0, - FlexBg: f&SbFlexBg > 0, - LargeDir: f&SbLargeDir > 0, - InlineData: f&SbInlineData > 0, - Encrypted: f&SbEncrypted > 0, - } -} - -// Superblock readonly compatible features. -// This is not exhaustive, unused features are not listed. -const ( - // SbSparse indicates sparse superblocks. Only groups with number either 0 or - // a power of 3, 5, or 7 will have redundant copies of the superblock and - // block descriptors. - SbSparse = 0x1 - - // SbLargeFile indicates that this fs has been used to store a file >= 2GiB. - SbLargeFile = 0x2 - - // SbHugeFile indicates that this fs contains files whose sizes are - // represented in units of logicals blocks, not 512-byte sectors. - SbHugeFile = 0x8 - - // SbGdtCsum indicates that group descriptors have checksums. - SbGdtCsum = 0x10 - - // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a - // 32,000 subdirectory limit. - SbDirNlink = 0x20 - - // SbExtraIsize indicates that large inodes exist on this filesystem. - SbExtraIsize = 0x40 - - // SbHasSnapshot indicates the existence of a snapshot. - SbHasSnapshot = 0x80 - - // SbQuota enables usage tracking for all quota types. - SbQuota = 0x100 - - // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation - // unit becomes a cluster rather than a data block. Then block bitmaps track - // clusters, not data blocks. - // - // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc. - SbBigalloc = 0x200 - - // SbMetadataCsum indicates that the fs supports metadata checksumming. - SbMetadataCsum = 0x400 - - // SbReadOnly marks this filesystem as readonly. Should refuse to mount in - // read/write mode. - SbReadOnly = 0x1000 -) - -// RoCompatFeatures represents a superblock's readonly compatible feature set. -// If the kernel does not understand any of these feature, it can still mount -// readonly. But if the user wants to mount read/write, the kernel should -// refuse to mount. -type RoCompatFeatures struct { - Sparse bool - LargeFile bool - HugeFile bool - GdtCsum bool - DirNlink bool - ExtraIsize bool - HasSnapshot bool - Quota bool - Bigalloc bool - MetadataCsum bool - ReadOnly bool -} - -// ToInt converts superblock readonly compatible features to its 32-bit rep. -func (f RoCompatFeatures) ToInt() uint32 { - var res uint32 - - if f.Sparse { - res |= SbSparse - } - if f.LargeFile { - res |= SbLargeFile - } - if f.HugeFile { - res |= SbHugeFile - } - if f.GdtCsum { - res |= SbGdtCsum - } - if f.DirNlink { - res |= SbDirNlink - } - if f.ExtraIsize { - res |= SbExtraIsize - } - if f.HasSnapshot { - res |= SbHasSnapshot - } - if f.Quota { - res |= SbQuota - } - if f.Bigalloc { - res |= SbBigalloc - } - if f.MetadataCsum { - res |= SbMetadataCsum - } - if f.ReadOnly { - res |= SbReadOnly - } - - return res -} - -// RoCompatFeaturesFromInt converts the integer representation of superblock -// readonly compatible features to RoCompatFeatures struct. -func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures { - return RoCompatFeatures{ - Sparse: f&SbSparse > 0, - LargeFile: f&SbLargeFile > 0, - HugeFile: f&SbHugeFile > 0, - GdtCsum: f&SbGdtCsum > 0, - DirNlink: f&SbDirNlink > 0, - ExtraIsize: f&SbExtraIsize > 0, - HasSnapshot: f&SbHasSnapshot > 0, - Quota: f&SbQuota > 0, - Bigalloc: f&SbBigalloc > 0, - MetadataCsum: f&SbMetadataCsum > 0, - ReadOnly: f&SbReadOnly > 0, - } -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go deleted file mode 100644 index 4dc6080fb..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of -// the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if -// RevLevel = DynamicRev and 64-bit feature is disabled. -// -// +marshal -type SuperBlock32Bit struct { - // We embed the old superblock struct here because the 32-bit version is just - // an extension of the old version. - SuperBlockOld - - FirstInode uint32 - InodeSizeRaw uint16 - BlockGroupNumber uint16 - FeatureCompat uint32 - FeatureIncompat uint32 - FeatureRoCompat uint32 - UUID [16]byte - VolumeName [16]byte - LastMounted [64]byte - AlgoUsageBitmap uint32 - PreallocBlocks uint8 - PreallocDirBlocks uint8 - ReservedGdtBlocks uint16 - JournalUUID [16]byte - JournalInum uint32 - JournalDev uint32 - LastOrphan uint32 - HashSeed [4]uint32 - DefaultHashVersion uint8 - JnlBackupType uint8 - BgDescSizeRaw uint16 - DefaultMountOpts uint32 - FirstMetaBg uint32 - MkfsTime uint32 - JnlBlocks [17]uint32 -} - -// Compiles only if SuperBlock32Bit implements SuperBlock. -var _ SuperBlock = (*SuperBlock32Bit)(nil) - -// Only override methods which change based on the additional fields above. -// Not overriding SuperBlock.BgDescSize because it would still return 32 here. - -// InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlock32Bit) InodeSize() uint16 { - return sb.InodeSizeRaw -} - -// CompatibleFeatures implements SuperBlock.CompatibleFeatures. -func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures { - return CompatFeaturesFromInt(sb.FeatureCompat) -} - -// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. -func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures { - return IncompatFeaturesFromInt(sb.FeatureIncompat) -} - -// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. -func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures { - return RoCompatFeaturesFromInt(sb.FeatureRoCompat) -} diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go deleted file mode 100644 index 2c9039327..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of -// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly -// 1024 bytes (smallest possible block size) and hence the superblock always -// fits in no more than one data block. Should only be used when the 64-bit -// feature is set. -// -// +marshal -type SuperBlock64Bit struct { - // We embed the 32-bit struct here because 64-bit version is just an extension - // of the 32-bit version. - SuperBlock32Bit - - BlocksCountHi uint32 - ReservedBlocksCountHi uint32 - FreeBlocksCountHi uint32 - MinInodeSize uint16 - WantInodeSize uint16 - Flags uint32 - RaidStride uint16 - MmpInterval uint16 - MmpBlock uint64 - RaidStripeWidth uint32 - LogGroupsPerFlex uint8 - ChecksumType uint8 - _ uint16 - KbytesWritten uint64 - SnapshotInum uint32 - SnapshotID uint32 - SnapshotRsrvBlocksCount uint64 - SnapshotList uint32 - ErrorCount uint32 - FirstErrorTime uint32 - FirstErrorInode uint32 - FirstErrorBlock uint64 - FirstErrorFunction [32]byte - FirstErrorLine uint32 - LastErrorTime uint32 - LastErrorInode uint32 - LastErrorLine uint32 - LastErrorBlock uint64 - LastErrorFunction [32]byte - MountOpts [64]byte - UserQuotaInum uint32 - GroupQuotaInum uint32 - OverheadBlocks uint32 - BackupBgs [2]uint32 - EncryptAlgos [4]uint8 - EncryptPwSalt [16]uint8 - LostFoundInode uint32 - ProjectQuotaInode uint32 - ChecksumSeed uint32 - WtimeHi uint8 - MtimeHi uint8 - MkfsTimeHi uint8 - LastCheckHi uint8 - FirstErrorTimeHi uint8 - LastErrorTimeHi uint8 - _ [2]uint8 - Encoding uint16 - EncodingFlags uint16 - _ [95]uint32 - Checksum uint32 -} - -// Compiles only if SuperBlock64Bit implements SuperBlock. -var _ SuperBlock = (*SuperBlock64Bit)(nil) - -// Only override methods which change based on the 64-bit feature. - -// BlocksCount implements SuperBlock.BlocksCount. -func (sb *SuperBlock64Bit) BlocksCount() uint64 { - return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo) -} - -// FreeBlocksCount implements SuperBlock.FreeBlocksCount. -func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 { - return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo) -} - -// BgDescSize implements SuperBlock.BgDescSize. -func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw } diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go deleted file mode 100644 index e4709f23c..000000000 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package disklayout - -// SuperBlockOld implements SuperBlock and represents the old version of the -// superblock struct. Should be used only if RevLevel = OldRev. -// -// +marshal -type SuperBlockOld struct { - InodesCountRaw uint32 - BlocksCountLo uint32 - ReservedBlocksCount uint32 - FreeBlocksCountLo uint32 - FreeInodesCountRaw uint32 - FirstDataBlockRaw uint32 - LogBlockSize uint32 - LogClusterSize uint32 - BlocksPerGroupRaw uint32 - ClustersPerGroupRaw uint32 - InodesPerGroupRaw uint32 - Mtime uint32 - Wtime uint32 - MountCountRaw uint16 - MaxMountCountRaw uint16 - MagicRaw uint16 - State uint16 - Errors uint16 - MinorRevLevel uint16 - LastCheck uint32 - CheckInterval uint32 - CreatorOS uint32 - RevLevel uint32 - DefResUID uint16 - DefResGID uint16 -} - -// Compiles only if SuperBlockOld implements SuperBlock. -var _ SuperBlock = (*SuperBlockOld)(nil) - -// InodesCount implements SuperBlock.InodesCount. -func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw } - -// BlocksCount implements SuperBlock.BlocksCount. -func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) } - -// FreeBlocksCount implements SuperBlock.FreeBlocksCount. -func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) } - -// FreeInodesCount implements SuperBlock.FreeInodesCount. -func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw } - -// MountCount implements SuperBlock.MountCount. -func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw } - -// MaxMountCount implements SuperBlock.MaxMountCount. -func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw } - -// FirstDataBlock implements SuperBlock.FirstDataBlock. -func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw } - -// BlockSize implements SuperBlock.BlockSize. -func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) } - -// BlocksPerGroup implements SuperBlock.BlocksPerGroup. -func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw } - -// ClusterSize implements SuperBlock.ClusterSize. -func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) } - -// ClustersPerGroup implements SuperBlock.ClustersPerGroup. -func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } - -// InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlockOld) InodeSize() uint16 { return OldInodeSize } - -// InodesPerGroup implements SuperBlock.InodesPerGroup. -func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } - -// BgDescSize implements SuperBlock.BgDescSize. -func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 } - -// CompatibleFeatures implements SuperBlock.CompatibleFeatures. -func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} } - -// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures. -func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} } - -// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures. -func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} } - -// Magic implements SuperBlock.Magic. -func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw } - -// Revision implements SuperBlock.Revision. -func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go deleted file mode 100644 index 38fb7962b..000000000 --- a/pkg/sentry/fsimpl/ext/ext.go +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ext implements readonly ext(2/3/4) filesystems. -package ext - -import ( - "errors" - "fmt" - "io" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// Name is the name of this filesystem. -const Name = "ext" - -// FilesystemType implements vfs.FilesystemType. -// -// +stateify savable -type FilesystemType struct{} - -// getDeviceFd returns an io.ReaderAt to the underlying device. -// Currently there are two ways of mounting an ext(2/3/4) fs: -// 1. Specify a mount with our internal special MountType in the OCI spec. -// 2. Expose the device to the container and mount it from application layer. -func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) { - if opts.InternalData == nil { - // User mount call. - // TODO(b/134676337): Open the device specified by `source` and return that. - panic("unimplemented") - } - - // GetFilesystem call originated from within the sentry. - devFd, ok := opts.InternalData.(int) - if !ok { - return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device") - } - - if devFd < 0 { - return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd) - } - - // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership - // of the file descriptor and hence will not close it when it is garbage - // collected. - return fd.NewReadWriter(devFd), nil -} - -// isCompatible checks if the superblock has feature sets which are compatible. -// We only need to check the superblock incompatible feature set since we are -// mounting readonly. We will also need to check readonly compatible feature -// set when mounting for read/write. -func isCompatible(sb disklayout.SuperBlock) bool { - // Please note that what is being checked is limited based on the fact that we - // are mounting readonly and that we are not journaling. When mounting - // read/write or with a journal, this must be reevaluated. - incompatFeatures := sb.IncompatibleFeatures() - if incompatFeatures.MetaBG { - log.Warningf("ext fs: meta block groups are not supported") - return false - } - if incompatFeatures.MMP { - log.Warningf("ext fs: multiple mount protection is not supported") - return false - } - if incompatFeatures.Encrypted { - log.Warningf("ext fs: encrypted inodes not supported") - return false - } - if incompatFeatures.InlineData { - log.Warningf("ext fs: inline files not supported") - return false - } - return true -} - -// Name implements vfs.FilesystemType.Name. -func (FilesystemType) Name() string { - return Name -} - -// Release implements vfs.FilesystemType.Release. -func (FilesystemType) Release(ctx context.Context) {} - -// GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - // TODO(b/134676337): Ensure that the user is mounting readonly. If not, - // EACCESS should be returned according to mount(2). Filesystem independent - // flags (like readonly) are currently not available in pkg/sentry/vfs. - - devMinor, err := vfsObj.GetAnonBlockDevMinor() - if err != nil { - return nil, nil, err - } - - dev, err := getDeviceFd(source, opts) - if err != nil { - return nil, nil, err - } - - fs := filesystem{ - dev: dev, - inodeCache: make(map[uint32]*inode), - devMinor: devMinor, - } - fs.vfsfs.Init(vfsObj, &fsType, &fs) - fs.sb, err = readSuperBlock(dev) - if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err - } - - if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { - // mount(2) specifies that EINVAL should be returned if the superblock is - // invalid. - fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EINVAL - } - - // Refuse to mount if the filesystem is incompatible. - if !isCompatible(fs.sb) { - fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EINVAL - } - - fs.bgs, err = readBlockGroups(dev, fs.sb) - if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err - } - - rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) - if err != nil { - fs.vfsfs.DecRef(ctx) - return nil, nil, err - } - rootInode.incRef() - - return &fs.vfsfs, &newDentry(rootInode).vfsd, nil -} diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go deleted file mode 100644 index d9fd4590c..000000000 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ /dev/null @@ -1,926 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "fmt" - "io" - "os" - "path" - "sort" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/test/testutil" - "gvisor.dev/gvisor/pkg/usermem" -) - -const ( - assetsDir = "pkg/sentry/fsimpl/ext/assets" -) - -var ( - ext2ImagePath = path.Join(assetsDir, "tiny.ext2") - ext3ImagePath = path.Join(assetsDir, "tiny.ext3") - ext4ImagePath = path.Join(assetsDir, "tiny.ext4") -) - -// setUp opens imagePath as an ext Filesystem and returns all necessary -// elements required to run tests. If error is non-nil, it also returns a tear -// down function which must be called after the test is run for clean up. -func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { - localImagePath, err := testutil.FindFile(imagePath) - if err != nil { - return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err) - } - - f, err := os.Open(localImagePath) - if err != nil { - return nil, nil, nil, nil, err - } - - ctx := contexttest.Context(t) - creds := auth.CredentialsFromContext(ctx) - - // Create VFS. - vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(ctx); err != nil { - t.Fatalf("VFS init: %v", err) - } - vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{ - GetFilesystemOptions: vfs.GetFilesystemOptions{ - InternalData: int(f.Fd()), - }, - }) - if err != nil { - f.Close() - return nil, nil, nil, nil, err - } - - root := mntns.Root() - root.IncRef() - - tearDown := func() { - root.DecRef(ctx) - - if err := f.Close(); err != nil { - t.Fatalf("tearDown failed: %v", err) - } - } - return ctx, vfsObj, &root, tearDown, nil -} - -// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and -// vfs.FilesystemImpl.StatFSAt which are not implemented in -// vfs.VirtualFilesystem yet. - -// TestSeek tests vfs.FileDescriptionImpl.Seek functionality. -func TestSeek(t *testing.T) { - type seekTest struct { - name string - image string - path string - } - - tests := []seekTest{ - { - name: "ext4 root dir seek", - image: ext4ImagePath, - path: "/", - }, - { - name: "ext3 root dir seek", - image: ext3ImagePath, - path: "/", - }, - { - name: "ext2 root dir seek", - image: ext2ImagePath, - path: "/", - }, - { - name: "ext4 reg file seek", - image: ext4ImagePath, - path: "/file.txt", - }, - { - name: "ext3 reg file seek", - image: ext3ImagePath, - path: "/file.txt", - }, - { - name: "ext2 reg file seek", - image: ext2ImagePath, - path: "/file.txt", - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fd, err := vfsfs.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt failed: %v", err) - } - - if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { - t.Errorf("expected seek position 0, got %d and error %v", n, err) - } - - stat, err := fd.Stat(ctx, vfs.StatOptions{}) - if err != nil { - t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err) - } - - // We should be able to seek beyond the end of file. - size := int64(stat.Size) - if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size, n, err) - } - - // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { - t.Errorf("expected error EINVAL but got %v", err) - } - - if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err) - } - - // Make sure negative offsets work with SEEK_CUR. - if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) - } - - // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { - t.Errorf("expected error EINVAL but got %v", err) - } - - // Make sure SEEK_END works with regular files. - if _, ok := fd.Impl().(*regularFileFD); ok { - // Seek back to 0. - if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", 0, n, err) - } - - // Seek forward beyond EOF. - if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { - t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) - } - - // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { - t.Errorf("expected error EINVAL but got %v", err) - } - } - }) - } -} - -// TestStatAt tests filesystem.StatAt functionality. -func TestStatAt(t *testing.T) { - type statAtTest struct { - name string - image string - path string - want linux.Statx - } - - tests := []statAtTest{ - { - name: "ext4 statx small file", - image: ext4ImagePath, - path: "/file.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13, - }, - }, - { - name: "ext3 statx small file", - image: ext3ImagePath, - path: "/file.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13, - }, - }, - { - name: "ext2 statx small file", - image: ext2ImagePath, - path: "/file.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13, - }, - }, - { - name: "ext4 statx big file", - image: ext4ImagePath, - path: "/bigfile.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13042, - }, - }, - { - name: "ext3 statx big file", - image: ext3ImagePath, - path: "/bigfile.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13042, - }, - }, - { - name: "ext2 statx big file", - image: ext2ImagePath, - path: "/bigfile.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0644 | linux.ModeRegular, - Size: 13042, - }, - }, - { - name: "ext4 statx symlink file", - image: ext4ImagePath, - path: "/symlink.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0777 | linux.ModeSymlink, - Size: 8, - }, - }, - { - name: "ext3 statx symlink file", - image: ext3ImagePath, - path: "/symlink.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0777 | linux.ModeSymlink, - Size: 8, - }, - }, - { - name: "ext2 statx symlink file", - image: ext2ImagePath, - path: "/symlink.txt", - want: linux.Statx{ - Blksize: 0x400, - Nlink: 1, - UID: 0, - GID: 0, - Mode: 0777 | linux.ModeSymlink, - Size: 8, - }, - }, - } - - // Ignore the fields that are not supported by filesystem.StatAt yet and - // those which are likely to change as the image does. - ignoredFields := map[string]bool{ - "Attributes": true, - "AttributesMask": true, - "Atime": true, - "Blocks": true, - "Btime": true, - "Ctime": true, - "DevMajor": true, - "DevMinor": true, - "Ino": true, - "Mask": true, - "Mtime": true, - "RdevMajor": true, - "RdevMinor": true, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - got, err := vfsfs.StatAt(ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, - &vfs.StatOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err) - } - - cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { - _, ok := ignoredFields[p.String()] - return ok - }, cmp.Ignore()) - if diff := cmp.Diff(got, test.want, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" { - t.Errorf("stat mismatch (-want +got):\n%s", diff) - } - }) - } -} - -// TestRead tests the read functionality for vfs file descriptions. -func TestRead(t *testing.T) { - type readTest struct { - name string - image string - absPath string - } - - tests := []readTest{ - { - name: "ext4 read small file", - image: ext4ImagePath, - absPath: "/file.txt", - }, - { - name: "ext3 read small file", - image: ext3ImagePath, - absPath: "/file.txt", - }, - { - name: "ext2 read small file", - image: ext2ImagePath, - absPath: "/file.txt", - }, - { - name: "ext4 read big file", - image: ext4ImagePath, - absPath: "/bigfile.txt", - }, - { - name: "ext3 read big file", - image: ext3ImagePath, - absPath: "/bigfile.txt", - }, - { - name: "ext2 read big file", - image: ext2ImagePath, - absPath: "/bigfile.txt", - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fd, err := vfsfs.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt failed: %v", err) - } - - // Get a local file descriptor and compare its functionality with a vfs file - // description for the same file. - localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath)) - if err != nil { - t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err) - } - - f, err := os.Open(localFile) - if err != nil { - t.Fatalf("os.Open failed for %s: %v", localFile, err) - } - defer f.Close() - - // Read the entire file by reading one byte repeatedly. Doing this stress - // tests the underlying file reader implementation. - got := make([]byte, 1) - want := make([]byte, 1) - for { - n, err := f.Read(want) - fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) - - if diff := cmp.Diff(got, want); diff != "" { - t.Errorf("file data mismatch (-want +got):\n%s", diff) - } - - // Make sure there is no more file data left after getting EOF. - if n == 0 || err == io.EOF { - if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { - t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image) - } - - break - } - - if err != nil { - t.Fatalf("read failed: %v", err) - } - } - }) - } -} - -// iterDirentsCb is a simple callback which just keeps adding the dirents to an -// internal list. Implements vfs.IterDirentsCallback. -type iterDirentsCb struct { - dirents []vfs.Dirent -} - -// Compiles only if iterDirentCb implements vfs.IterDirentsCallback. -var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil) - -// newIterDirentsCb is the iterDirent -func newIterDirentCb() *iterDirentsCb { - return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)} -} - -// Handle implements vfs.IterDirentsCallback.Handle. -func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error { - cb.dirents = append(cb.dirents, dirent) - return nil -} - -// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality. -func TestIterDirents(t *testing.T) { - type iterDirentTest struct { - name string - image string - path string - want []vfs.Dirent - } - - wantDirents := []vfs.Dirent{ - { - Name: ".", - Type: linux.DT_DIR, - }, - { - Name: "..", - Type: linux.DT_DIR, - }, - { - Name: "lost+found", - Type: linux.DT_DIR, - }, - { - Name: "file.txt", - Type: linux.DT_REG, - }, - { - Name: "bigfile.txt", - Type: linux.DT_REG, - }, - { - Name: "symlink.txt", - Type: linux.DT_LNK, - }, - } - tests := []iterDirentTest{ - { - name: "ext4 root dir iteration", - image: ext4ImagePath, - path: "/", - want: wantDirents, - }, - { - name: "ext3 root dir iteration", - image: ext3ImagePath, - path: "/", - want: wantDirents, - }, - { - name: "ext2 root dir iteration", - image: ext2ImagePath, - path: "/", - want: wantDirents, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ctx, vfsfs, root, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fd, err := vfsfs.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt failed: %v", err) - } - - cb := &iterDirentsCb{} - if err = fd.IterDirents(ctx, cb); err != nil { - t.Fatalf("dir fd.IterDirents() failed: %v", err) - } - - sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name }) - sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name }) - - // Ignore the inode number and offset of dirents because those are likely to - // change as the underlying image changes. - cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { - return p.String() == "Ino" || p.String() == "NextOff" - }, cmp.Ignore()) - if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" { - t.Errorf("dirents mismatch (-want +got):\n%s", diff) - } - }) - } -} - -// TestRootDir tests that the root directory inode is correctly initialized and -// returned from setUp. -func TestRootDir(t *testing.T) { - type inodeProps struct { - Mode linux.FileMode - UID auth.KUID - GID auth.KGID - Size uint64 - InodeSize uint16 - Links uint16 - Flags disklayout.InodeFlags - } - - type rootDirTest struct { - name string - image string - wantInode inodeProps - } - - tests := []rootDirTest{ - { - name: "ext4 root dir", - image: ext4ImagePath, - wantInode: inodeProps{ - Mode: linux.ModeDirectory | 0755, - Size: 0x400, - InodeSize: 0x80, - Links: 3, - Flags: disklayout.InodeFlags{Extents: true}, - }, - }, - { - name: "ext3 root dir", - image: ext3ImagePath, - wantInode: inodeProps{ - Mode: linux.ModeDirectory | 0755, - Size: 0x400, - InodeSize: 0x80, - Links: 3, - }, - }, - { - name: "ext2 root dir", - image: ext2ImagePath, - wantInode: inodeProps{ - Mode: linux.ModeDirectory | 0755, - Size: 0x400, - InodeSize: 0x80, - Links: 3, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - _, _, vd, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - d, ok := vd.Dentry().Impl().(*dentry) - if !ok { - t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl()) - } - - // Offload inode contents into local structs for comparison. - gotInode := inodeProps{ - Mode: d.inode.diskInode.Mode(), - UID: d.inode.diskInode.UID(), - GID: d.inode.diskInode.GID(), - Size: d.inode.diskInode.Size(), - InodeSize: d.inode.diskInode.InodeSize(), - Links: d.inode.diskInode.LinksCount(), - Flags: d.inode.diskInode.Flags(), - } - - if diff := cmp.Diff(gotInode, test.wantInode); diff != "" { - t.Errorf("inode mismatch (-want +got):\n%s", diff) - } - }) - } -} - -// TestFilesystemInit tests that the filesystem superblock and block group -// descriptors are correctly read in and initialized. -func TestFilesystemInit(t *testing.T) { - // sb only contains the immutable properties of the superblock. - type sb struct { - InodesCount uint32 - BlocksCount uint64 - MaxMountCount uint16 - FirstDataBlock uint32 - BlockSize uint64 - BlocksPerGroup uint32 - ClusterSize uint64 - ClustersPerGroup uint32 - InodeSize uint16 - InodesPerGroup uint32 - BgDescSize uint16 - Magic uint16 - Revision disklayout.SbRevision - CompatFeatures disklayout.CompatFeatures - IncompatFeatures disklayout.IncompatFeatures - RoCompatFeatures disklayout.RoCompatFeatures - } - - // bg only contains the immutable properties of the block group descriptor. - type bg struct { - InodeTable uint64 - BlockBitmap uint64 - InodeBitmap uint64 - ExclusionBitmap uint64 - Flags disklayout.BGFlags - } - - type fsInitTest struct { - name string - image string - wantSb sb - wantBgs []bg - } - - tests := []fsInitTest{ - { - name: "ext4 filesystem init", - image: ext4ImagePath, - wantSb: sb{ - InodesCount: 0x10, - BlocksCount: 0x40, - MaxMountCount: 0xffff, - FirstDataBlock: 0x1, - BlockSize: 0x400, - BlocksPerGroup: 0x2000, - ClusterSize: 0x400, - ClustersPerGroup: 0x2000, - InodeSize: 0x80, - InodesPerGroup: 0x10, - BgDescSize: 0x40, - Magic: linux.EXT_SUPER_MAGIC, - Revision: disklayout.DynamicRev, - CompatFeatures: disklayout.CompatFeatures{ - ExtAttr: true, - ResizeInode: true, - DirIndex: true, - }, - IncompatFeatures: disklayout.IncompatFeatures{ - DirentFileType: true, - Extents: true, - Is64Bit: true, - FlexBg: true, - }, - RoCompatFeatures: disklayout.RoCompatFeatures{ - Sparse: true, - LargeFile: true, - HugeFile: true, - DirNlink: true, - ExtraIsize: true, - MetadataCsum: true, - }, - }, - wantBgs: []bg{ - { - InodeTable: 0x23, - BlockBitmap: 0x3, - InodeBitmap: 0x13, - Flags: disklayout.BGFlags{ - InodeZeroed: true, - }, - }, - }, - }, - { - name: "ext3 filesystem init", - image: ext3ImagePath, - wantSb: sb{ - InodesCount: 0x10, - BlocksCount: 0x40, - MaxMountCount: 0xffff, - FirstDataBlock: 0x1, - BlockSize: 0x400, - BlocksPerGroup: 0x2000, - ClusterSize: 0x400, - ClustersPerGroup: 0x2000, - InodeSize: 0x80, - InodesPerGroup: 0x10, - BgDescSize: 0x20, - Magic: linux.EXT_SUPER_MAGIC, - Revision: disklayout.DynamicRev, - CompatFeatures: disklayout.CompatFeatures{ - ExtAttr: true, - ResizeInode: true, - DirIndex: true, - }, - IncompatFeatures: disklayout.IncompatFeatures{ - DirentFileType: true, - }, - RoCompatFeatures: disklayout.RoCompatFeatures{ - Sparse: true, - LargeFile: true, - }, - }, - wantBgs: []bg{ - { - InodeTable: 0x5, - BlockBitmap: 0x3, - InodeBitmap: 0x4, - Flags: disklayout.BGFlags{ - InodeZeroed: true, - }, - }, - }, - }, - { - name: "ext2 filesystem init", - image: ext2ImagePath, - wantSb: sb{ - InodesCount: 0x10, - BlocksCount: 0x40, - MaxMountCount: 0xffff, - FirstDataBlock: 0x1, - BlockSize: 0x400, - BlocksPerGroup: 0x2000, - ClusterSize: 0x400, - ClustersPerGroup: 0x2000, - InodeSize: 0x80, - InodesPerGroup: 0x10, - BgDescSize: 0x20, - Magic: linux.EXT_SUPER_MAGIC, - Revision: disklayout.DynamicRev, - CompatFeatures: disklayout.CompatFeatures{ - ExtAttr: true, - ResizeInode: true, - DirIndex: true, - }, - IncompatFeatures: disklayout.IncompatFeatures{ - DirentFileType: true, - }, - RoCompatFeatures: disklayout.RoCompatFeatures{ - Sparse: true, - LargeFile: true, - }, - }, - wantBgs: []bg{ - { - InodeTable: 0x5, - BlockBitmap: 0x3, - InodeBitmap: 0x4, - Flags: disklayout.BGFlags{ - InodeZeroed: true, - }, - }, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - _, _, vd, tearDown, err := setUp(t, test.image) - if err != nil { - t.Fatalf("setUp failed: %v", err) - } - defer tearDown() - - fs, ok := vd.Mount().Filesystem().Impl().(*filesystem) - if !ok { - t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl()) - } - - // Offload superblock and block group descriptors contents into - // local structs for comparison. - totalFreeInodes := uint32(0) - totalFreeBlocks := uint64(0) - gotSb := sb{ - InodesCount: fs.sb.InodesCount(), - BlocksCount: fs.sb.BlocksCount(), - MaxMountCount: fs.sb.MaxMountCount(), - FirstDataBlock: fs.sb.FirstDataBlock(), - BlockSize: fs.sb.BlockSize(), - BlocksPerGroup: fs.sb.BlocksPerGroup(), - ClusterSize: fs.sb.ClusterSize(), - ClustersPerGroup: fs.sb.ClustersPerGroup(), - InodeSize: fs.sb.InodeSize(), - InodesPerGroup: fs.sb.InodesPerGroup(), - BgDescSize: fs.sb.BgDescSize(), - Magic: fs.sb.Magic(), - Revision: fs.sb.Revision(), - CompatFeatures: fs.sb.CompatibleFeatures(), - IncompatFeatures: fs.sb.IncompatibleFeatures(), - RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(), - } - gotNumBgs := len(fs.bgs) - gotBgs := make([]bg, gotNumBgs) - for i := 0; i < gotNumBgs; i++ { - gotBgs[i].InodeTable = fs.bgs[i].InodeTable() - gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap() - gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap() - gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap() - gotBgs[i].Flags = fs.bgs[i].Flags() - - totalFreeInodes += fs.bgs[i].FreeInodesCount() - totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount()) - } - - if diff := cmp.Diff(gotSb, test.wantSb); diff != "" { - t.Errorf("superblock mismatch (-want +got):\n%s", diff) - } - - if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" { - t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff) - } - - if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" { - t.Errorf("total free inodes mismatch (-want +got):\n%s", diff) - } - - if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" { - t.Errorf("total free blocks mismatch (-want +got):\n%s", diff) - } - }) - } -} diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go deleted file mode 100644 index 778460107..000000000 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" - "sort" - - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/syserror" -) - -// extentFile is a type of regular file which uses extents to store file data. -// -// +stateify savable -type extentFile struct { - regFile regularFile - - // root is the root extent node. This lives in the 60 byte diskInode.Data(). - // Immutable. - root disklayout.ExtentNode -} - -// Compiles only if extentFile implements io.ReaderAt. -var _ io.ReaderAt = (*extentFile)(nil) - -// newExtentFile is the extent file constructor. It reads the entire extent -// tree into memory. -// TODO(b/134676337): Build extent tree on demand to reduce memory usage. -func newExtentFile(args inodeArgs) (*extentFile, error) { - file := &extentFile{} - file.regFile.impl = file - file.regFile.inode.init(args, &file.regFile) - err := file.buildExtTree() - if err != nil { - return nil, err - } - return file, nil -} - -// buildExtTree builds the extent tree by reading it from disk by doing -// running a simple DFS. It first reads the root node from the inode struct in -// memory. Then it recursively builds the rest of the tree by reading it off -// disk. -// -// Precondition: inode flag InExtents must be set. -func (f *extentFile) buildExtTree() error { - rootNodeData := f.regFile.inode.diskInode.Data() - - f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize]) - - // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. - if f.root.Header.NumEntries > 4 { - // read(2) specifies that EINVAL should be returned if the file is unsuitable - // for reading. - return syserror.EINVAL - } - - f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries) - for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { - var curEntry disklayout.ExtentEntry - if f.root.Header.Height == 0 { - // Leaf node. - curEntry = &disklayout.Extent{} - } else { - // Internal node. - curEntry = &disklayout.ExtentIdx{} - } - curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize]) - f.root.Entries[i].Entry = curEntry - } - - // If this node is internal, perform DFS. - if f.root.Header.Height > 0 { - for i := uint16(0); i < f.root.Header.NumEntries; i++ { - var err error - if f.root.Entries[i].Node, err = f.buildExtTreeFromDisk(f.root.Entries[i].Entry); err != nil { - return err - } - } - } - - return nil -} - -// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively -// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to -// by the ExtentEntry. -func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) { - var header disklayout.ExtentHeader - off := entry.PhysicalBlock() * f.regFile.inode.blkSize - err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header) - if err != nil { - return nil, err - } - - entries := make([]disklayout.ExtentEntryPair, header.NumEntries) - for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { - var curEntry disklayout.ExtentEntry - if header.Height == 0 { - // Leaf node. - curEntry = &disklayout.Extent{} - } else { - // Internal node. - curEntry = &disklayout.ExtentIdx{} - } - - err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry) - if err != nil { - return nil, err - } - entries[i].Entry = curEntry - } - - // If this node is internal, perform DFS. - if header.Height > 0 { - for i := uint16(0); i < header.NumEntries; i++ { - var err error - entries[i].Node, err = f.buildExtTreeFromDisk(entries[i].Entry) - if err != nil { - return nil, err - } - } - } - - return &disklayout.ExtentNode{header, entries}, nil -} - -// ReadAt implements io.ReaderAt.ReadAt. -func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) { - if len(dst) == 0 { - return 0, nil - } - - if off < 0 { - return 0, syserror.EINVAL - } - - if uint64(off) >= f.regFile.inode.diskInode.Size() { - return 0, io.EOF - } - - n, err := f.read(&f.root, uint64(off), dst) - if n < len(dst) && err == nil { - err = io.EOF - } - return n, err -} - -// read is the recursive step of extentFile.ReadAt which traverses the extent -// tree from the node passed and reads file data. -func (f *extentFile) read(node *disklayout.ExtentNode, off uint64, dst []byte) (int, error) { - // Perform a binary search for the node covering bytes starting at r.fileOff. - // A highly fragmented filesystem can have upto 340 entries and so linear - // search should be avoided. Finds the first entry which does not cover the - // file block we want and subtracts 1 to get the desired index. - fileBlk := uint32(off / f.regFile.inode.blkSize) - n := len(node.Entries) - found := sort.Search(n, func(i int) bool { - return node.Entries[i].Entry.FileBlock() > fileBlk - }) - 1 - - // We should be in this recursive step only if the data we want exists under - // the current node. - if found < 0 { - panic("searching for a file block in an extent entry which does not cover it") - } - - read := 0 - toRead := len(dst) - var curR int - var err error - for i := found; i < n && read < toRead; i++ { - if node.Header.Height == 0 { - curR, err = f.readFromExtent(node.Entries[i].Entry.(*disklayout.Extent), off, dst[read:]) - } else { - curR, err = f.read(node.Entries[i].Node, off, dst[read:]) - } - - read += curR - off += uint64(curR) - if err != nil { - return read, err - } - } - - return read, nil -} - -// readFromExtent reads file data from the extent. It takes advantage of the -// sequential nature of extents and reads file data from multiple blocks in one -// call. -// -// A non-nil error indicates that this is a partial read and there is probably -// more to read from this extent. The caller should propagate the error upward -// and not move to the next extent in the tree. -// -// A subsequent call to extentReader.Read should continue reading from where we -// left off as expected. -func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byte) (int, error) { - curFileBlk := uint32(off / f.regFile.inode.blkSize) - exFirstFileBlk := ex.FileBlock() - exLastFileBlk := exFirstFileBlk + uint32(ex.Length) // This is exclusive. - - // We should be in this recursive step only if the data we want exists under - // the current extent. - if curFileBlk < exFirstFileBlk || exLastFileBlk <= curFileBlk { - panic("searching for a file block in an extent which does not cover it") - } - - curPhyBlk := uint64(curFileBlk-exFirstFileBlk) + ex.PhysicalBlock() - readStart := curPhyBlk*f.regFile.inode.blkSize + (off % f.regFile.inode.blkSize) - - endPhyBlk := ex.PhysicalBlock() + uint64(ex.Length) - extentEnd := endPhyBlk * f.regFile.inode.blkSize // This is exclusive. - - toRead := int(extentEnd - readStart) - if len(dst) < toRead { - toRead = len(dst) - } - - n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart)) - if n < toRead { - return n, syserror.EIO - } - return n, nil -} diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go deleted file mode 100644 index 985f76ac0..000000000 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "bytes" - "math/rand" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" -) - -const ( - // mockExtentBlkSize is the mock block size used for testing. - // No block has more than 1 header + 4 entries. - mockExtentBlkSize = uint64(64) -) - -// The tree described below looks like: -// -// 0.{Head}[Idx][Idx] -// / \ -// / \ -// 1.{Head}[Ext][Ext] 2.{Head}[Idx] -// / | \ -// [Phy] [Phy, Phy] 3.{Head}[Ext] -// | -// [Phy, Phy, Phy] -// -// Legend: -// - Head = ExtentHeader -// - Idx = ExtentIdx -// - Ext = Extent -// - Phy = Physical Block -// -// Please note that ext4 might not construct extent trees looking like this. -// This is purely for testing the tree traversal logic. -var ( - node3 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 1, - MaxEntries: 4, - Height: 0, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.Extent{ - FirstFileBlock: 3, - Length: 3, - StartBlockLo: 6, - }, - Node: nil, - }, - }, - } - - node2 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 1, - MaxEntries: 4, - Height: 1, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.ExtentIdx{ - FirstFileBlock: 3, - ChildBlockLo: 2, - }, - Node: node3, - }, - }, - } - - node1 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 2, - MaxEntries: 4, - Height: 0, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.Extent{ - FirstFileBlock: 0, - Length: 1, - StartBlockLo: 3, - }, - Node: nil, - }, - { - Entry: &disklayout.Extent{ - FirstFileBlock: 1, - Length: 2, - StartBlockLo: 4, - }, - Node: nil, - }, - }, - } - - node0 = &disklayout.ExtentNode{ - Header: disklayout.ExtentHeader{ - Magic: disklayout.ExtentMagic, - NumEntries: 2, - MaxEntries: 4, - Height: 2, - }, - Entries: []disklayout.ExtentEntryPair{ - { - Entry: &disklayout.ExtentIdx{ - FirstFileBlock: 0, - ChildBlockLo: 0, - }, - Node: node1, - }, - { - Entry: &disklayout.ExtentIdx{ - FirstFileBlock: 3, - ChildBlockLo: 1, - }, - Node: node2, - }, - }, - } -) - -// TestExtentReader stress tests extentReader functionality. It performs random -// length reads from all possible positions in the extent tree. -func TestExtentReader(t *testing.T) { - mockExtentFile, want := extentTreeSetUp(t, node0) - n := len(want) - - for from := 0; from < n; from++ { - got := make([]byte, n-from) - - if read, err := mockExtentFile.ReadAt(got, int64(from)); err != nil { - t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err) - } - - if diff := cmp.Diff(got, want[from:]); diff != "" { - t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff) - } - } -} - -// TestBuildExtentTree tests the extent tree building logic. -func TestBuildExtentTree(t *testing.T) { - mockExtentFile, _ := extentTreeSetUp(t, node0) - - opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{}) - if diff := cmp.Diff(&mockExtentFile.root, node0, opt); diff != "" { - t.Errorf("extent tree mismatch (-want +got):\n%s", diff) - } -} - -// extentTreeSetUp writes the passed extent tree to a mock disk as an extent -// tree. It also constucts a mock extent file with the same tree built in it. -// It also writes random data file data and returns it. -func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []byte) { - t.Helper() - - mockDisk := make([]byte, mockExtentBlkSize*10) - mockExtentFile := &extentFile{} - args := inodeArgs{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), - }, - }, - blkSize: mockExtentBlkSize, - } - mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile) - - fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize) - - if err := mockExtentFile.buildExtTree(); err != nil { - t.Fatalf("inode.buildExtTree failed: %v", err) - } - return mockExtentFile, fileData -} - -// writeTree writes the tree represented by `root` to the inode and disk. It -// also writes random file data on disk. -func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { - rootData := in.diskInode.Data() - root.Header.MarshalBytes(rootData) - off := root.Header.SizeBytes() - for _, ep := range root.Entries { - ep.Entry.MarshalBytes(rootData[off:]) - off += ep.Entry.SizeBytes() - } - - var fileData []byte - for _, ep := range root.Entries { - if root.Header.Height == 0 { - fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...) - } else { - fileData = append(fileData, writeTreeToDisk(disk, ep)...) - } - } - return fileData -} - -// writeTreeToDisk is the recursive step for writeTree which writes the tree -// on the disk only. Also writes random file data on disk. -func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { - nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:] - curNode.Node.Header.MarshalBytes(nodeData) - off := curNode.Node.Header.SizeBytes() - for _, ep := range curNode.Node.Entries { - ep.Entry.MarshalBytes(nodeData[off:]) - off += ep.Entry.SizeBytes() - } - - var fileData []byte - for _, ep := range curNode.Node.Entries { - if curNode.Node.Header.Height == 0 { - fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...) - } else { - fileData = append(fileData, writeTreeToDisk(disk, ep)...) - } - } - return fileData -} - -// writeFileDataToExtent writes random bytes to the blocks on disk that the -// passed extent points to. -func writeFileDataToExtent(disk []byte, ex *disklayout.Extent) []byte { - phyExStartBlk := ex.PhysicalBlock() - phyExStartOff := phyExStartBlk * mockExtentBlkSize - phyExEndOff := phyExStartOff + uint64(ex.Length)*mockExtentBlkSize - rand.Read(disk[phyExStartOff:phyExEndOff]) - return disk[phyExStartOff:phyExEndOff] -} - -// getNumPhyBlks returns the number of physical blocks covered under the node. -func getNumPhyBlks(node *disklayout.ExtentNode) uint32 { - var res uint32 - for _, ep := range node.Entries { - if node.Header.Height == 0 { - res += uint32(ep.Entry.(*disklayout.Extent).Length) - } else { - res += getNumPhyBlks(ep.Node) - } - } - return res -} diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go deleted file mode 100644 index 90b086468..000000000 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// fileDescription is embedded by ext implementations of -// vfs.FileDescriptionImpl. -type fileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl - vfs.LockFD -} - -func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) -} - -func (fd *fileDescription) inode() *inode { - return fd.vfsfd.Dentry().Impl().(*dentry).inode -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - var stat linux.Statx - fd.inode().statTo(&stat) - return stat, nil -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - if opts.Stat.Mask == 0 { - return nil - } - return syserror.EPERM -} - -// SetStat implements vfs.FileDescriptionImpl.StatFS. -func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { - var stat linux.Statfs - fd.filesystem().statTo(&stat) - return stat, nil -} - -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *fileDescription) Sync(ctx context.Context) error { - return nil -} diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go deleted file mode 100644 index d4fc484a2..000000000 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ /dev/null @@ -1,555 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "errors" - "io" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" -) - -var ( - // errResolveDirent indicates that the vfs.ResolvingPath.Component() does - // not exist on the dentry tree but does exist on disk. So it has to be read in - // using the in-memory dirent and added to the dentry tree. Usually indicates - // the need to lock filesystem.mu for writing. - errResolveDirent = errors.New("resolve path component using dirent") -) - -// filesystem implements vfs.FilesystemImpl. -// -// +stateify savable -type filesystem struct { - vfsfs vfs.Filesystem - - // mu serializes changes to the Dentry tree. - mu sync.RWMutex `state:"nosave"` - - // dev represents the underlying fs device. It does not require protection - // because io.ReaderAt permits concurrent read calls to it. It translates to - // the pread syscall which passes on the read request directly to the device - // driver. Device drivers are intelligent in serving multiple concurrent read - // requests in the optimal order (taking locality into consideration). - dev io.ReaderAt - - // inodeCache maps absolute inode numbers to the corresponding Inode struct. - // Inodes should be removed from this once their reference count hits 0. - // - // Protected by mu because most additions (see IterDirents) and all removals - // from this corresponds to a change in the dentry tree. - inodeCache map[uint32]*inode - - // sb represents the filesystem superblock. Immutable after initialization. - sb disklayout.SuperBlock - - // bgs represents all the block group descriptors for the filesystem. - // Immutable after initialization. - bgs []disklayout.BlockGroup - - // devMinor is this filesystem's device minor number. Immutable after - // initialization. - devMinor uint32 -} - -// Compiles only if filesystem implements vfs.FilesystemImpl. -var _ vfs.FilesystemImpl = (*filesystem)(nil) - -// stepLocked resolves rp.Component() in parent directory vfsd. The write -// parameter passed tells if the caller has acquired filesystem.mu for writing -// or not. If set to true, an existing inode on disk can be added to the dentry -// tree if not present already. -// -// stepLocked is loosely analogous to fs/namei.c:walk_component(). -// -// Preconditions: -// * filesystem.mu must be locked (for writing if write param is true). -// * !rp.Done(). -// * inode == vfsd.Impl().(*Dentry).inode. -func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { - return nil, nil, err - } - - for { - name := rp.Component() - if name == "." { - rp.Advance() - return vfsd, inode, nil - } - d := vfsd.Impl().(*dentry) - if name == ".." { - isRoot, err := rp.CheckRoot(ctx, vfsd) - if err != nil { - return nil, nil, err - } - if isRoot || d.parent == nil { - rp.Advance() - return vfsd, inode, nil - } - if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { - return nil, nil, err - } - rp.Advance() - return &d.parent.vfsd, d.parent.inode, nil - } - - dir := inode.impl.(*directory) - child, ok := dir.childCache[name] - if !ok { - // We may need to instantiate a new dentry for this child. - childDirent, ok := dir.childMap[name] - if !ok { - // The underlying inode does not exist on disk. - return nil, nil, syserror.ENOENT - } - - if !write { - // filesystem.mu must be held for writing to add to the dentry tree. - return nil, nil, errResolveDirent - } - - // Create and add the component's dirent to the dentry tree. - fs := rp.Mount().Filesystem().Impl().(*filesystem) - childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode()) - if err != nil { - return nil, nil, err - } - // incRef because this is being added to the dentry tree. - childInode.incRef() - child = newDentry(childInode) - child.parent = d - child.name = name - dir.childCache[name] = child - } - if err := rp.CheckMount(ctx, &child.vfsd); err != nil { - return nil, nil, err - } - if child.inode.isSymlink() && rp.ShouldFollowSymlink() { - if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil { - return nil, nil, err - } - continue - } - rp.Advance() - return &child.vfsd, child.inode, nil - } -} - -// walkLocked resolves rp to an existing file. The write parameter -// passed tells if the caller has acquired filesystem.mu for writing or not. -// If set to true, additions can be made to the dentry tree while walking. -// If errResolveDirent is returned, the walk needs to be continued with an -// upgraded filesystem.mu. -// -// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). -// -// Preconditions: -// * filesystem.mu must be locked (for writing if write param is true). -func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Done() { - var err error - vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) - if err != nil { - return nil, nil, err - } - } - if rp.MustBeDir() && !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - return vfsd, inode, nil -} - -// walkParentLocked resolves all but the last path component of rp to an -// existing directory. It does not check that the returned directory is -// searchable by the provider of rp. The write parameter passed tells if the -// caller has acquired filesystem.mu for writing or not. If set to true, -// additions can be made to the dentry tree while walking. -// If errResolveDirent is returned, the walk needs to be continued with an -// upgraded filesystem.mu. -// -// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). -// -// Preconditions: -// * filesystem.mu must be locked (for writing if write param is true). -// * !rp.Done(). -func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Final() { - var err error - vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) - if err != nil { - return nil, nil, err - } - } - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR - } - return vfsd, inode, nil -} - -// walk resolves rp to an existing file. If parent is set to true, it resolves -// the rp till the parent of the last component which should be an existing -// directory. If parent is false then resolves rp entirely. Attemps to resolve -// the path as far as it can with a read lock and upgrades the lock if needed. -func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { - var ( - vfsd *vfs.Dentry - inode *inode - err error - ) - - // Try walking with the hopes that all dentries have already been pulled out - // of disk. This reduces congestion (allows concurrent walks). - fs.mu.RLock() - if parent { - vfsd, inode, err = walkParentLocked(ctx, rp, false) - } else { - vfsd, inode, err = walkLocked(ctx, rp, false) - } - fs.mu.RUnlock() - - if err == errResolveDirent { - // Upgrade lock and continue walking. Lock upgrading in the middle of the - // walk is fine as this is a read only filesystem. - fs.mu.Lock() - if parent { - vfsd, inode, err = walkParentLocked(ctx, rp, true) - } else { - vfsd, inode, err = walkLocked(ctx, rp, true) - } - fs.mu.Unlock() - } - - return vfsd, inode, err -} - -// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in. -// It creates a new one with the given inode number if one does not exist. -// The caller must increment the ref count if adding this to the dentry tree. -// -// Precondition: must be holding fs.mu for writing. -func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) { - if in, ok := fs.inodeCache[inodeNum]; ok { - return in, nil - } - - in, err := newInode(fs, inodeNum) - if err != nil { - return nil, err - } - - fs.inodeCache[inodeNum] = in - return in, nil -} - -// statTo writes the statfs fields to the output parameter. -func (fs *filesystem) statTo(stat *linux.Statfs) { - stat.Type = uint64(fs.sb.Magic()) - stat.BlockSize = int64(fs.sb.BlockSize()) - stat.Blocks = fs.sb.BlocksCount() - stat.BlocksFree = fs.sb.FreeBlocksCount() - stat.BlocksAvailable = fs.sb.FreeBlocksCount() - stat.Files = uint64(fs.sb.InodesCount()) - stat.FilesFree = uint64(fs.sb.FreeInodesCount()) - stat.NameLength = disklayout.MaxFileName - stat.FragmentSize = int64(fs.sb.BlockSize()) - // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. -} - -// AccessAt implements vfs.Filesystem.Impl.AccessAt. -func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - return inode.checkPermissions(rp.Credentials(), ats) -} - -// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. -func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - - if opts.CheckSearchable { - if !inode.isDir() { - return nil, syserror.ENOTDIR - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { - return nil, err - } - } - - inode.incRef() - return vfsd, nil -} - -// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. -func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(ctx, rp, true) - if err != nil { - return nil, err - } - inode.incRef() - return vfsd, nil -} - -// OpenAt implements vfs.FilesystemImpl.OpenAt. -func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - vfsd, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - - // EROFS is returned if write access is needed. - if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { - return nil, syserror.EROFS - } - return inode.open(rp, vfsd, &opts) -} - -// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. -func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return "", err - } - symlink, ok := inode.impl.(*symlink) - if !ok { - return "", syserror.EINVAL - } - return symlink.target, nil -} - -// StatAt implements vfs.FilesystemImpl.StatAt. -func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return linux.Statx{}, err - } - var stat linux.Statx - inode.statTo(&stat) - return stat, nil -} - -// StatFSAt implements vfs.FilesystemImpl.StatFSAt. -func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - if _, _, err := fs.walk(ctx, rp, false); err != nil { - return linux.Statfs{}, err - } - - var stat linux.Statfs - fs.statTo(&stat) - return stat, nil -} - -// Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release(ctx context.Context) { - fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) -} - -// Sync implements vfs.FilesystemImpl.Sync. -func (fs *filesystem) Sync(ctx context.Context) error { - // This is a readonly filesystem for now. - return nil -} - -// The vfs.FilesystemImpl functions below return EROFS because their respective -// man pages say that EROFS must be returned if the path resolves to a file on -// this read-only filesystem. - -// LinkAt implements vfs.FilesystemImpl.LinkAt. -func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - if rp.Done() { - return syserror.EEXIST - } - - if _, _, err := fs.walk(ctx, rp, true); err != nil { - return err - } - - return syserror.EROFS -} - -// MkdirAt implements vfs.FilesystemImpl.MkdirAt. -func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - if rp.Done() { - return syserror.EEXIST - } - - if _, _, err := fs.walk(ctx, rp, true); err != nil { - return err - } - - return syserror.EROFS -} - -// MknodAt implements vfs.FilesystemImpl.MknodAt. -func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - if rp.Done() { - return syserror.EEXIST - } - - _, _, err := fs.walk(ctx, rp, true) - if err != nil { - return err - } - - return syserror.EROFS -} - -// RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - if rp.Done() { - return syserror.ENOENT - } - - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - return syserror.EROFS -} - -// RmdirAt implements vfs.FilesystemImpl.RmdirAt. -func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - if !inode.isDir() { - return syserror.ENOTDIR - } - - return syserror.EROFS -} - -// SetStatAt implements vfs.FilesystemImpl.SetStatAt. -func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - return syserror.EROFS -} - -// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. -func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - if rp.Done() { - return syserror.EEXIST - } - - _, _, err := fs.walk(ctx, rp, true) - if err != nil { - return err - } - - return syserror.EROFS -} - -// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. -func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - - if inode.isDir() { - return syserror.EISDIR - } - - return syserror.EROFS -} - -// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { - _, inode, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { - return nil, err - } - - // TODO(b/134676337): Support sockets. - return nil, syserror.ECONNREFUSED -} - -// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. -func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return nil, err - } - return nil, syserror.ENOTSUP -} - -// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. -func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return "", err - } - return "", syserror.ENOTSUP -} - -// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. -func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - return syserror.ENOTSUP -} - -// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. -func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - _, _, err := fs.walk(ctx, rp, false) - if err != nil { - return err - } - return syserror.ENOTSUP -} - -// PrependPath implements vfs.FilesystemImpl.PrependPath. -func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { - fs.mu.RLock() - defer fs.mu.RUnlock() - return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) -} - -// MountOptions implements vfs.FilesystemImpl.MountOptions. -func (fs *filesystem) MountOptions() string { - return "" -} diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go deleted file mode 100644 index 4a555bf72..000000000 --- a/pkg/sentry/fsimpl/ext/inode.go +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "fmt" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// inode represents an ext inode. -// -// inode uses the same inheritance pattern that pkg/sentry/vfs structures use. -// This has been done to increase memory locality. -// -// Implementations: -// inode -- -// |-- dir -// |-- symlink -// |-- regular-- -// |-- extent file -// |-- block map file -// -// +stateify savable -type inode struct { - // refs is a reference count. refs is accessed using atomic memory operations. - refs int64 - - // fs is the containing filesystem. - fs *filesystem - - // inodeNum is the inode number of this inode on disk. This is used to - // identify inodes within the ext filesystem. - inodeNum uint32 - - // blkSize is the fs data block size. Same as filesystem.sb.BlockSize(). - blkSize uint64 - - // diskInode gives us access to the inode struct on disk. Immutable. - diskInode disklayout.Inode - - locks vfs.FileLocks - - // This is immutable. The first field of the implementations must have inode - // as the first field to ensure temporality. - impl interface{} -} - -// incRef increments the inode ref count. -func (in *inode) incRef() { - atomic.AddInt64(&in.refs, 1) -} - -// tryIncRef tries to increment the ref count. Returns true if successful. -func (in *inode) tryIncRef() bool { - for { - refs := atomic.LoadInt64(&in.refs) - if refs == 0 { - return false - } - if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) { - return true - } - } -} - -// decRef decrements the inode ref count and releases the inode resources if -// the ref count hits 0. -// -// Precondition: Must have locked filesystem.mu. -func (in *inode) decRef() { - if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { - delete(in.fs.inodeCache, in.inodeNum) - } else if refs < 0 { - panic("ext.inode.decRef() called without holding a reference") - } -} - -// newInode is the inode constructor. Reads the inode off disk. Identifies -// inodes based on the absolute inode number on disk. -func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { - if inodeNum == 0 { - panic("inode number 0 on ext filesystems is not possible") - } - - inodeRecordSize := fs.sb.InodeSize() - var diskInode disklayout.Inode - if inodeRecordSize == disklayout.OldInodeSize { - diskInode = &disklayout.InodeOld{} - } else { - diskInode = &disklayout.InodeNew{} - } - - // Calculate where the inode is actually placed. - inodesPerGrp := fs.sb.InodesPerGroup() - blkSize := fs.sb.BlockSize() - inodeTableOff := fs.bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize - inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp)) - - if err := readFromDisk(fs.dev, int64(inodeOff), diskInode); err != nil { - return nil, err - } - - // Build the inode based on its type. - args := inodeArgs{ - fs: fs, - inodeNum: inodeNum, - blkSize: blkSize, - diskInode: diskInode, - } - - switch diskInode.Mode().FileType() { - case linux.ModeSymlink: - f, err := newSymlink(args) - if err != nil { - return nil, err - } - return &f.inode, nil - case linux.ModeRegular: - f, err := newRegularFile(args) - if err != nil { - return nil, err - } - return &f.inode, nil - case linux.ModeDirectory: - f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType) - if err != nil { - return nil, err - } - return &f.inode, nil - default: - // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices. - return nil, syserror.EINVAL - } -} - -type inodeArgs struct { - fs *filesystem - inodeNum uint32 - blkSize uint64 - diskInode disklayout.Inode -} - -func (in *inode) init(args inodeArgs, impl interface{}) { - in.fs = args.fs - in.inodeNum = args.inodeNum - in.blkSize = args.blkSize - in.diskInode = args.diskInode - in.impl = impl -} - -// open creates and returns a file description for the dentry passed in. -func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(opts) - if err := in.checkPermissions(rp.Credentials(), ats); err != nil { - return nil, err - } - mnt := rp.Mount() - switch in.impl.(type) { - case *regularFile: - var fd regularFileFD - fd.LockFD.Init(&in.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil - case *directory: - // Can't open directories writably. This check is not necessary for a read - // only filesystem but will be required when write is implemented. - if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR - } - var fd directoryFD - fd.LockFD.Init(&in.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil - case *symlink: - if opts.Flags&linux.O_PATH == 0 { - // Can't open symlinks without O_PATH. - return nil, syserror.ELOOP - } - var fd symlinkFD - fd.LockFD.Init(&in.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil - default: - panic(fmt.Sprintf("unknown inode type: %T", in.impl)) - } -} - -func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID()) -} - -// statTo writes the statx fields to the output parameter. -func (in *inode) statTo(stat *linux.Statx) { - stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | - linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | - linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME - stat.Blksize = uint32(in.blkSize) - stat.Mode = uint16(in.diskInode.Mode()) - stat.Nlink = uint32(in.diskInode.LinksCount()) - stat.UID = uint32(in.diskInode.UID()) - stat.GID = uint32(in.diskInode.GID()) - stat.Ino = uint64(in.inodeNum) - stat.Size = in.diskInode.Size() - stat.Atime = in.diskInode.AccessTime().StatxTimestamp() - stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() - stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() - stat.DevMajor = linux.UNNAMED_MAJOR - stat.DevMinor = in.fs.devMinor - // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks - // (including metadata blocks) required to represent this file. -} - -// getBGNum returns the block group number that a given inode belongs to. -func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 { - return (inodeNum - 1) / inodesPerGrp -} - -// getBGOff returns the offset at which the given inode lives in the block -// group's inode table, i.e. the index of the inode in the inode table. -func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 { - return (inodeNum - 1) % inodesPerGrp -} diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go deleted file mode 100644 index 5ad9befcd..000000000 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// regularFile represents a regular file's inode. This too follows the -// inheritance pattern prevelant in the vfs layer described in -// pkg/sentry/vfs/README.md. -// -// +stateify savable -type regularFile struct { - inode inode - - // This is immutable. The first field of fileReader implementations must be - // regularFile to ensure temporality. - // io.ReaderAt is more strict than io.Reader in the sense that a partial read - // is always accompanied by an error. If a read spans past the end of file, a - // partial read (within file range) is done and io.EOF is returned. - impl io.ReaderAt -} - -// newRegularFile is the regularFile constructor. It figures out what kind of -// file this is and initializes the fileReader. -func newRegularFile(args inodeArgs) (*regularFile, error) { - if args.diskInode.Flags().Extents { - file, err := newExtentFile(args) - if err != nil { - return nil, err - } - return &file.regFile, nil - } - - file, err := newBlockMapFile(args) - if err != nil { - return nil, err - } - return &file.regFile, nil -} - -func (in *inode) isRegular() bool { - _, ok := in.impl.(*regularFile) - return ok -} - -// directoryFD represents a directory file description. It implements -// vfs.FileDescriptionImpl. -// -// +stateify savable -type regularFileFD struct { - fileDescription - vfs.LockFD - - // off is the file offset. off is accessed using atomic memory operations. - off int64 - - // offMu serializes operations that may mutate off. - offMu sync.Mutex `state:"nosave"` -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release(context.Context) {} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - safeReader := safemem.FromIOReaderAt{ - ReaderAt: fd.inode().impl.(*regularFile).impl, - Offset: offset, - } - - // Copies data from disk directly into usermem without any intermediate - // allocations (if dst is converted into BlockSeq such that it does not need - // safe copying). - return dst.CopyOutFrom(ctx, safeReader) -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - n, err := fd.PRead(ctx, dst, fd.off, opts) - fd.offMu.Lock() - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - // write(2) specifies that EBADF must be returned if the fd is not open for - // writing. - return 0, syserror.EBADF -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.offMu.Lock() - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - return syserror.ENOTDIR -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.offMu.Lock() - defer fd.offMu.Unlock() - switch whence { - case linux.SEEK_SET: - // Use offset as specified. - case linux.SEEK_CUR: - offset += fd.off - case linux.SEEK_END: - offset += int64(fd.inode().diskInode.Size()) - default: - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - fd.off = offset - return offset, nil -} - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - // TODO(b/134676337): Implement mmap(2). - return syserror.ENODEV -} diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go deleted file mode 100644 index 5e2bcc837..000000000 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// symlink represents a symlink inode. -// -// +stateify savable -type symlink struct { - inode inode - target string // immutable -} - -// newSymlink is the symlink constructor. It reads out the symlink target from -// the inode (however it might have been stored). -func newSymlink(args inodeArgs) (*symlink, error) { - var link []byte - - // If the symlink target is lesser than 60 bytes, its stores in inode.Data(). - // Otherwise either extents or block maps will be used to store the link. - size := args.diskInode.Size() - if size < 60 { - link = args.diskInode.Data()[:size] - } else { - // Create a regular file out of this inode and read out the target. - regFile, err := newRegularFile(args) - if err != nil { - return nil, err - } - - link = make([]byte, size) - if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size { - return nil, err - } - } - - file := &symlink{target: string(link)} - file.inode.init(args, file) - return file, nil -} - -func (in *inode) isSymlink() bool { - _, ok := in.impl.(*symlink) - return ok -} - -// symlinkFD represents a symlink file description and implements -// vfs.FileDescriptionImpl. which may only be used if open options contains -// O_PATH. For this reason most of the functions return EBADF. -// -// +stateify savable -type symlinkFD struct { - fileDescription - vfs.NoLockFD -} - -// Compiles only if symlinkFD implements vfs.FileDescriptionImpl. -var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *symlinkFD) Release(context.Context) {} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EBADF -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EBADF -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EBADF -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EBADF -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - return syserror.ENOTDIR -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.EBADF -} - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - return syserror.EBADF -} diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go deleted file mode 100644 index 58ef7b9b8..000000000 --- a/pkg/sentry/fsimpl/ext/utils.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" - - "gvisor.dev/gvisor/pkg/marshal" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/syserror" -) - -// readFromDisk performs a binary read from disk into the given struct from -// the absolute offset provided. -func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error { - n := v.SizeBytes() - buf := make([]byte, n) - if read, _ := dev.ReadAt(buf, abOff); read < int(n) { - return syserror.EIO - } - - v.UnmarshalBytes(buf) - return nil -} - -// readSuperBlock reads the SuperBlock from block group 0 in the underlying -// device. There are three versions of the superblock. This function identifies -// and returns the correct version. -func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) { - var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} - if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { - return nil, err - } - if sb.Revision() == disklayout.OldRev { - return sb, nil - } - - sb = &disklayout.SuperBlock32Bit{} - if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { - return nil, err - } - if !sb.IncompatibleFeatures().Is64Bit { - return sb, nil - } - - sb = &disklayout.SuperBlock64Bit{} - if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { - return nil, err - } - return sb, nil -} - -// blockGroupsCount returns the number of block groups in the ext fs. -func blockGroupsCount(sb disklayout.SuperBlock) uint64 { - blocksCount := sb.BlocksCount() - blocksPerGroup := uint64(sb.BlocksPerGroup()) - - // Round up the result. float64 can compromise precision so do it manually. - return (blocksCount + blocksPerGroup - 1) / blocksPerGroup -} - -// readBlockGroups reads the block group descriptor table from block group 0 in -// the underlying device. -func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { - bgCount := blockGroupsCount(sb) - bgdSize := uint64(sb.BgDescSize()) - is64Bit := sb.IncompatibleFeatures().Is64Bit - bgds := make([]disklayout.BlockGroup, bgCount) - - for i, off := uint64(0), uint64(sb.FirstDataBlock()+1)*sb.BlockSize(); i < bgCount; i, off = i+1, off+bgdSize { - if is64Bit { - bgds[i] = &disklayout.BlockGroup64Bit{} - } else { - bgds[i] = &disklayout.BlockGroup32Bit{} - } - - if err := readFromDisk(dev, int64(off), bgds[i]); err != nil { - return nil, err - } - } - return bgds, nil -} diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 3a4777fbe..05c4fbeb2 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -46,6 +46,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/marshal", @@ -58,7 +59,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -76,13 +76,13 @@ go_test( library = ":fuse", deps = [ "//pkg/abi/linux", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index 077bf9307..d404edaf0 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -252,11 +252,11 @@ func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { } if !conn.connected { - return nil, syserror.ENOTCONN + return nil, linuxerr.ENOTCONN } if conn.connInitError { - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } fut, err := conn.callFuture(t, r) @@ -306,7 +306,7 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes conn.mu.Unlock() // we checked connected before, // this must be due to aborted connection. - return nil, syserror.ECONNABORTED + return nil, linuxerr.ECONNABORTED } conn.mu.Unlock() diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go index 78ea6a31e..1fddd858e 100644 --- a/pkg/sentry/fsimpl/fuse/connection_test.go +++ b/pkg/sentry/fsimpl/fuse/connection_test.go @@ -19,9 +19,9 @@ import ( "testing" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) // TestConnectionInitBlock tests if initialization @@ -104,7 +104,7 @@ func TestConnectionAbort(t *testing.T) { // After abort, Call() should return directly with ENOTCONN. req := conn.NewRequest(creds, 0, 0, 0, testObj) _, err = conn.Call(task, req) - if err != syserror.ENOTCONN { + if !linuxerr.Equals(linuxerr.ENOTCONN, err) { t.Fatalf("Incorrect error code received for Call() after connection aborted") } diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 5d2bae14e..0f855ac59 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -18,11 +18,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -37,7 +37,7 @@ type fuseDevice struct{} // Open implements vfs.Device.Open. func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if !kernel.FUSEEnabled { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } var fd DeviceFD @@ -122,17 +122,17 @@ func (fd *DeviceFD) Release(ctx context.Context) { func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } // We require that any Read done on this filesystem have a sane minimum @@ -149,7 +149,7 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R // If the read buffer is too small, error out. if dst.NumBytes() < int64(minBuffSize) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.mu.Lock() @@ -191,7 +191,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts } if req == nil { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // We already checked the size: dst must be able to fit the whole request. @@ -204,7 +204,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts return 0, err } if n != len(req.data) { - return 0, syserror.EIO + return 0, linuxerr.EIO } if req.hdr.Opcode == linux.FUSE_WRITE { @@ -213,7 +213,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts return 0, err } if written != len(req.payload) { - return 0, syserror.EIO + return 0, linuxerr.EIO } n += int(written) } @@ -234,10 +234,10 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Write implements vfs.FileDescriptionImpl.Write. @@ -251,12 +251,12 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs. func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } // Return ENODEV if the filesystem is umounted. if fd.fs.umounted { - return 0, syserror.ENODEV + return 0, linuxerr.ENODEV } var cn, n int64 @@ -293,7 +293,7 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt // Assert that the header isn't read into the writeBuf yet. if fd.writeCursor >= hdrLen { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // We don't have the full common response header yet. @@ -322,7 +322,7 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt if !ok { // Server sent us a response for a request we never sent, // or for which we already received a reply (e.g. aborted), an unlikely event. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } delete(fd.completions, hdr.Unique) @@ -391,10 +391,10 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // sendResponse sends a response to the waiting task (if any). @@ -434,7 +434,7 @@ func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUS if !ok { // A response for a request we never sent, // or for which we already received a reply (e.g. aborted). - return syserror.EINVAL + return linuxerr.EINVAL } delete(fd.completions, respHdr.Unique) diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 04250d796..8951b5ba8 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -20,11 +20,11 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -186,7 +186,7 @@ func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem. // "would block". n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{}) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go index fcc5d9a2a..9611edd5a 100644 --- a/pkg/sentry/fsimpl/fuse/directory.go +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -32,27 +32,27 @@ type directoryFD struct { // Allocate implements directoryFD.Allocate. func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EISDIR + return linuxerr.EISDIR } // PRead implements vfs.FileDescriptionImpl.PRead. func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Read implements vfs.FileDescriptionImpl.Read. func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements vfs.FileDescriptionImpl.Write. func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 167c899e2..af16098d2 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -23,13 +23,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -121,30 +121,30 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt deviceDescriptorStr, ok := mopts["fd"] if !ok { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option fd missing") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "fd") deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) if err != nil { ctx.Debugf("fusefs.FilesystemType.GetFilesystem: invalid fd: %q (%v)", deviceDescriptorStr, err) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fuseFDGeneric := kernelTask.GetFileVFS2(int32(deviceDescriptor)) if fuseFDGeneric == nil { - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } defer fuseFDGeneric.DecRef(ctx) fuseFD, ok := fuseFDGeneric.Impl().(*DeviceFD) if !ok { log.Warningf("%s.GetFilesystem: device FD is %T, not a FUSE device", fsType.Name, fuseFDGeneric) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Parse and set all the other supported FUSE mount options. @@ -154,17 +154,17 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), uidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.uid = kuid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option user_id missing") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if gidStr, ok := mopts["group_id"]; ok { @@ -172,17 +172,17 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), gidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.gid = kgid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option group_id missing") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if modeStr, ok := mopts["rootmode"]; ok { @@ -190,12 +190,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.rootMode = linux.FileMode(mode) } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option rootmode missing") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Set the maxInFlightRequests option. @@ -206,7 +206,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if maxRead < fuseMinMaxRead { maxRead = fuseMinMaxRead @@ -229,7 +229,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Check for unparsed options. if len(mopts) != 0 { log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Create a new FUSE filesystem. @@ -258,7 +258,7 @@ func newFUSEFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, fsTyp conn, err := newFUSEConnection(ctx, fuseFD, opts) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fs := &filesystem{ @@ -375,7 +375,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a creds.RealKGID != i.fs.opts.gid || creds.EffectiveKGID != i.fs.opts.gid || creds.SavedKGID != i.fs.opts.gid { - return syserror.EACCES + return linuxerr.EACCES } } @@ -393,10 +393,10 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr isDir := i.InodeAttrs.Mode().IsDir() // return error if specified to open directory but inode is not a directory. if !isDir && opts.Mode.IsDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS { - return nil, syserror.EOVERFLOW + return nil, linuxerr.EOVERFLOW } var fd *fileDescription @@ -418,7 +418,7 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context") - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // Build the request. @@ -440,7 +440,7 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr if err != nil { return nil, err } - if err := res.Error(); err == syserror.ENOSYS && !isDir { + if err := res.Error(); linuxerr.Equals(linuxerr.ENOSYS, err) && !isDir { i.fs.conn.noOpen = true } else if err != nil { return nil, err @@ -512,7 +512,7 @@ func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } in := linux.FUSECreateIn{ CreateMeta: linux.FUSECreateMeta{ @@ -552,7 +552,7 @@ func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) err kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) - return syserror.EINVAL + return linuxerr.EINVAL } in := linux.FUSEUnlinkIn{Name: name} req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) @@ -596,7 +596,7 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) res, err := i.fs.conn.Call(kernelTask, req) @@ -611,7 +611,7 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo return nil, err } if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { - return nil, syserror.EIO + return nil, linuxerr.EIO } child := i.fs.newInode(ctx, out.NodeID, out.Attr) return child, nil @@ -626,13 +626,13 @@ func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, // Readlink implements kernfs.Inode.Readlink. func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if i.Mode().FileType()&linux.S_IFLNK == 0 { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } if len(i.link) == 0 { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") - return "", syserror.EINVAL + return "", linuxerr.EINVAL } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) res, err := i.fs.conn.Call(kernelTask, req) @@ -728,7 +728,7 @@ func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOp task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return linux.FUSEAttr{}, syserror.EINVAL + return linux.FUSEAttr{}, linuxerr.EINVAL } creds := auth.CredentialsFromContext(ctx) @@ -833,7 +833,7 @@ func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return syserror.EINVAL + return linuxerr.EINVAL } // We should retain the original file type when assigning new mode. diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go index 66ea889f9..fe119aa43 100644 --- a/pkg/sentry/fsimpl/fuse/read_write.go +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -20,11 +20,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) // ReadInPages sends FUSE_READ requests for the size after round it up to @@ -39,7 +39,7 @@ func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off ui t := kernel.TaskFromContext(ctx) if t == nil { log.Warningf("fusefs.Read: couldn't get kernel task from context") - return nil, 0, syserror.EINVAL + return nil, 0, linuxerr.EINVAL } // Round up to a multiple of page size. @@ -155,7 +155,7 @@ func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, t := kernel.TaskFromContext(ctx) if t == nil { log.Warningf("fusefs.Read: couldn't get kernel task from context") - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // One request cannnot exceed either maxWrite or maxPages. @@ -220,7 +220,7 @@ func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, // Write more than requested? EIO. if out.Size > toWrite { - return 0, syserror.EIO + return 0, linuxerr.EIO } written += out.Size diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go index 5bdd096c3..38cde8208 100644 --- a/pkg/sentry/fsimpl/fuse/regular_file.go +++ b/pkg/sentry/fsimpl/fuse/regular_file.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -39,14 +39,14 @@ type regularFileFD struct { // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } size := dst.NumBytes() @@ -56,7 +56,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } else if size > math.MaxUint32 { // FUSE only supports uint32 for size. // Overflow. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // TODO(gvisor.dev/issue/3678): Add direct IO support. @@ -107,7 +107,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, err } if int64(cp) != toCopy { - return 0, syserror.EIO + return 0, linuxerr.EIO } copied += toCopy } @@ -143,14 +143,14 @@ func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // final offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } inode := fd.inode() @@ -171,11 +171,11 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off if srclen > math.MaxUint32 { // FUSE only supports uint32 for size. // Overflow. - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } if end := offset + srclen; end < offset { // Overflow. - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } srclen, err = vfs.CheckLimit(ctx, offset, srclen) @@ -204,7 +204,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, err } if int64(cp) != srclen { - return 0, offset, syserror.EIO + return 0, offset, linuxerr.EIO } n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data) @@ -215,7 +215,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off if n == 0 { // We have checked srclen != 0 previously. // If err == nil, then it's a short write and we return EIO. - return 0, offset, syserror.EIO + return 0, offset, linuxerr.EIO } written = int64(n) diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 368272f12..4244f2cf5 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -49,6 +49,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/fdnotifier", "//pkg/fspath", @@ -78,7 +79,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/unet", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 177e42649..5c48a9fee 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refsvfs2" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) func (d *dentry) isDir() bool { @@ -297,7 +297,7 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in switch whence { case linux.SEEK_SET: if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset == 0 { // Ensure that the next call to fd.IterDirents() calls @@ -309,13 +309,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in case linux.SEEK_CUR: offset += fd.off if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Don't clear fd.dirents in this case, even if offset == 0. fd.off = offset return fd.off, nil default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index eb09d54c3..00228c469 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" @@ -32,32 +33,19 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // Snapshot current syncable dentries and special file FDs. - fs.renameMu.RLock() fs.syncMu.Lock() ds := make([]*dentry, 0, len(fs.syncableDentries)) for d := range fs.syncableDentries { - // It's safe to use IncRef here even though fs.syncableDentries doesn't - // hold references since we hold fs.renameMu. Note that we can't use - // TryIncRef since cached dentries at zero references should still be - // synced. - d.IncRef() ds = append(ds, d) } - fs.renameMu.RUnlock() sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { - // As above, fs.specialFileFDs doesn't hold references. However, unlike - // dentries, an FD that has reached zero references can't be - // resurrected, so we can use TryIncRef. - if sffd.vfsfd.TryIncRef() { - sffds = append(sffds, sffd) - } + sffds = append(sffds, sffd) } fs.syncMu.Unlock() @@ -67,9 +55,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync syncable dentries. for _, d := range ds { - err := d.syncCachedFile(ctx, true /* forFilesystemSync */) - d.DecRef(ctx) - if err != nil { + if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) if retErr == nil { retErr = err @@ -80,9 +66,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - err := sffd.sync(ctx, true /* forFilesystemSync */) - sffd.vfsfd.DecRef(ctx) - if err != nil { + if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) if retErr == nil { retErr = err @@ -146,6 +130,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -158,6 +143,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp ** putDentrySlice(*dsp) } +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() @@ -186,7 +172,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[] // Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, bool, error) { if !d.isDir() { - return nil, false, syserror.ENOTDIR + return nil, false, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, false, err @@ -244,18 +230,18 @@ afterSymlink: // * dentry at name has been revalidated func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } if child, ok := parent.children[name]; ok || parent.isSynthetic() { if child == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return child, nil } qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil { - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { parent.cacheNegativeLookupLocked(name) } return nil, err @@ -302,7 +288,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving } } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -330,7 +316,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -359,10 +345,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } name := rp.Component() if name == "." || name == ".." { - return syserror.EEXIST + return linuxerr.EEXIST } if parent.isDeleted() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil { return err @@ -372,20 +358,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir defer parent.dirMu.Unlock() if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } // Check for existence only if caching information is available. Otherwise, // don't check for existence just yet. We will check for existence if the // checks for writability fail below. Existence check is done by the creation // RPCs themselves. if child, ok := parent.children[name]; ok && child != nil { - return syserror.EEXIST + return linuxerr.EEXIST } checkExistence := func() error { - if child, err := fs.getChildLocked(ctx, parent, name, &ds); err != nil && err != syserror.ENOENT { + if child, err := fs.getChildLocked(ctx, parent, name, &ds); err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } else if child != nil { - return syserror.EEXIST + return linuxerr.EEXIST } return nil } @@ -408,11 +394,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return err } if !dir && rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if parent.isSynthetic() { if createInSyntheticDir == nil { - return syserror.EPERM + return linuxerr.EPERM } if err := createInSyntheticDir(parent, name); err != nil { return err @@ -469,14 +455,14 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b name := rp.Component() if dir { if name == "." { - return syserror.EINVAL + return linuxerr.EINVAL } if name == ".." { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } else { if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } } @@ -499,7 +485,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b child, ok = parent.children[name] if ok && child == nil { // Hit a negative cached entry, child doesn't exist. - return syserror.ENOENT + return linuxerr.ENOENT } } else { child, _, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) @@ -539,8 +525,8 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if child.syntheticChildren != 0 { // This is definitely not an empty directory, irrespective of // fs.opts.interop. - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTEMPTY + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: PrepareDeleteDentry called if child != nil. + return linuxerr.ENOTEMPTY } // If InteropModeShared is in effect and the first call to // PrepareDeleteDentry above succeeded, then child wasn't @@ -549,13 +535,13 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // still exist) would be a waste of time. if child.cachedMetadataAuthoritative() { if !child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTDIR + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. + return linuxerr.ENOTDIR } for _, grandchild := range child.children { if grandchild != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTEMPTY + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. + return linuxerr.ENOTEMPTY } } } @@ -564,25 +550,25 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } else { // child must be a non-directory file. if child != nil && child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.EISDIR + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. + return linuxerr.EISDIR } if rp.MustBeDir() { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } if parent.isSynthetic() { if child == nil { - return syserror.ENOENT + return linuxerr.ENOENT } } else if child == nil || !child.isSynthetic() { err = parent.file.unlinkAt(ctx, name, flags) if err != nil { if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) + vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return err } @@ -600,7 +586,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } if child != nil { - vfsObj.CommitDeleteDentry(ctx, &child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // +checklocksforce: see above. child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- @@ -642,7 +628,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -674,11 +660,11 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error { if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) if d.isDir() { - return syserror.EPERM + return linuxerr.EPERM } gid := auth.KGID(atomic.LoadUint32(&d.gid)) uid := auth.KUID(atomic.LoadUint32(&d.uid)) @@ -687,10 +673,10 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } if d.nlink == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } if d.nlink == math.MaxUint32 { - return syserror.EMLINK + return linuxerr.EMLINK } if err := parent.file.link(ctx, d.file, childName); err != nil { return err @@ -715,7 +701,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v mode |= linux.S_ISGID } if _, err := parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)); err != nil { - if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { return err } ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) @@ -734,7 +720,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, func(parent *dentry, name string) error { if !opts.ForSyntheticMountpoint { // Can't create non-synthetic files in synthetic directories. - return syserror.EPERM + return linuxerr.EPERM } parent.createSyntheticChildLocked(&createSyntheticOpts{ name: name, @@ -752,7 +738,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error { creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - if err != syserror.EPERM { + if !linuxerr.Equals(linuxerr.EPERM, err) { return err } @@ -764,8 +750,8 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v switch { case err == nil: // Step succeeded, another file exists. - return syserror.EEXIST - case err != syserror.ENOENT: + return linuxerr.EEXIST + case !linuxerr.Equals(linuxerr.ENOENT, err): // Unexpected error. return err } @@ -793,7 +779,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return nil } // Retain error from gofer if synthetic file cannot be created internally. - return syserror.EPERM + return linuxerr.EPERM }, nil) } @@ -804,7 +790,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // support, and it isn't clear that there's any way to implement this in // 9P. if opts.Flags&linux.O_TMPFILE != 0 { - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } mayCreate := opts.Flags&linux.O_CREAT != 0 mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) @@ -824,10 +810,10 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if !start.cachedMetadataAuthoritative() { // Refresh dentry's attributes before opening. @@ -854,7 +840,7 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil { return nil, err @@ -862,10 +848,10 @@ afterTrailingSymlink: // Determine whether or not we need to create a file. parent.dirMu.Lock() child, _, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) - if err == syserror.ENOENT && mayCreate { + if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { if parent.isSynthetic() { parent.dirMu.Unlock() - return nil, syserror.EPERM + return nil, linuxerr.EPERM } fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) parent.dirMu.Unlock() @@ -876,7 +862,7 @@ afterTrailingSymlink: return nil, err } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } // Open existing child or follow symlink. if child.isSymlink() && rp.ShouldFollowSymlink() { @@ -891,7 +877,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } if rp.MustBeDir() && !child.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } child.IncRef() defer child.DecRef(ctx) @@ -935,14 +921,14 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if !d.isSynthetic() { if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { @@ -962,10 +948,10 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return &fd.vfsfd, nil case linux.S_IFLNK: // Can't open symlinks without O_PATH, which is handled at the VFS layer. - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP case linux.S_IFSOCK: if d.isSynthetic() { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } if d.fs.iopts.OpenSocketsByConnecting { return d.openSocketByConnecting(ctx, opts) @@ -998,7 +984,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) if err != nil { @@ -1019,7 +1005,7 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // We assume that the server silently inserts O_NONBLOCK in the open flags // for all named pipes (because all existing gofers do this). @@ -1033,7 +1019,7 @@ func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs. retry: h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) if err != nil { - if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO { + if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && linuxerr.Equals(linuxerr.ENXIO, err) { // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails // with ENXIO if opening the same named pipe with O_WRONLY would // block because there are no readers of the pipe. @@ -1067,7 +1053,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving return nil, err } if d.isDeleted() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -1187,7 +1173,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st return "", err } if !d.isSymlink() { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } return d.readlink(ctx, rp.Mount()) } @@ -1204,24 +1190,24 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if fs.opts.interop == InteropModeShared && opts.Flags&linux.RENAME_NOREPLACE != 0 { // Requires 9P support to synchronize with other remote filesystem // users. - return syserror.EINVAL + return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -1260,7 +1246,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { - return syserror.EINVAL + return linuxerr.EINVAL } if oldParent != newParent { if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { @@ -1269,7 +1255,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -1281,28 +1267,28 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer newParent.dirMu.Unlock() } if newParent.isDeleted() { - return syserror.ENOENT + return linuxerr.ENOENT } replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds) - if err != nil && err != syserror.ENOENT { + if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } var replacedVFSD *vfs.Dentry if replaced != nil { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } else { if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } } @@ -1507,7 +1493,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return d.endpoint, nil } } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index cf69e1b7a..43440ec19 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -46,6 +46,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" @@ -61,7 +62,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" ) @@ -318,7 +318,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt mfp := pgalloc.MemoryFileProviderFromContext(ctx) if mfp == nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } mopts := vfs.GenericParseMountOptions(opts.Data) @@ -354,7 +354,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsopts.interop = InteropModeShared default: ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } @@ -365,7 +365,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // In Linux, dfltuid is interpreted as a UID and is converted to a KUID // in the caller's user namespace, but goferfs isn't @@ -378,7 +378,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.dfltgid = auth.KGID(dfltgid) } @@ -390,7 +390,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt msize, err := strconv.ParseUint(msizestr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: %s=%s", moptMsize, msizestr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.msize = uint32(msize) } @@ -409,7 +409,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxCachedDentries, err := strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } fsopts.maxCachedDentries = maxCachedDentries } @@ -433,14 +433,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Check for unparsed options. if len(mopts) != 0 { ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Handle internal options. iopts, ok := opts.InternalData.(InternalFilesystemOptions) if opts.InternalData != nil && !ok { ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // If !ok, iopts being the zero value is correct. @@ -503,7 +503,7 @@ func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int trans, ok := mopts[moptTransport] if !ok || trans != transportModeFD { ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } delete(mopts, moptTransport) @@ -511,28 +511,28 @@ func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int rfdstr, ok := mopts[moptReadFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } delete(mopts, moptReadFD) rfd, err := strconv.Atoi(rfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } wfdstr, ok := mopts[moptWriteFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } delete(mopts, moptWriteFD) wfd, err := strconv.Atoi(wfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } if rfd != wfd { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } return rfd, nil } @@ -581,10 +581,10 @@ func (fs *filesystem) Release(ctx context.Context) { d.dataMu.Unlock() // Close host FDs if they exist. if d.readFD >= 0 { - unix.Close(int(d.readFD)) + _ = unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { - unix.Close(int(d.writeFD)) + _ = unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 @@ -864,11 +864,11 @@ func dentryAttrMask() p9.AttrMask { func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { if !mask.Mode { ctx.Warningf("can't create gofer.dentry without file type") - return nil, syserror.EIO + return nil, linuxerr.EIO } if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { ctx.Warningf("can't create regular file gofer.dentry without file size") - return nil, syserror.EIO + return nil, linuxerr.EIO } d := &dentry{ @@ -946,10 +946,10 @@ func (d *dentry) cachedMetadataAuthoritative() bool { // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. // Precondition: d.metadataMu must be locked. +// +checklocks:d.metadataMu func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if mask.Mode { if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { - d.metadataMu.Unlock() panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) } atomic.StoreUint32(&d.mode, uint32(attr.Mode)) @@ -988,13 +988,14 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { // Preconditions: !d.isSynthetic(). // Preconditions: d.metadataMu is locked. +// +checklocks:d.metadataMu func (d *dentry) refreshSizeLocked(ctx context.Context) error { d.handleMu.RLock() if d.writeFD < 0 { d.handleMu.RUnlock() // Ask the gofer if we don't have a host FD. - return d.updateFromGetattrLocked(ctx) + return d.updateFromGetattrLocked(ctx, p9file{}) } var stat unix.Statx_t @@ -1013,37 +1014,41 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { // updating stale attributes in d.updateFromP9AttrsLocked(). d.metadataMu.Lock() defer d.metadataMu.Unlock() - return d.updateFromGetattrLocked(ctx) + return d.updateFromGetattrLocked(ctx, p9file{}) } // Preconditions: // * !d.isSynthetic(). // * d.metadataMu is locked. -func (d *dentry) updateFromGetattrLocked(ctx context.Context) error { - // Use d.readFile or d.writeFile, which represent 9P FIDs that have been - // opened, in preference to d.file, which represents a 9P fid that has not. - // This may be significantly more efficient in some implementations. Prefer - // d.writeFile over d.readFile since some filesystem implementations may - // update a writable handle's metadata after writes to that handle, without - // making metadata updates immediately visible to read-only handles - // representing the same file. - d.handleMu.RLock() - handleMuRLocked := true - var file p9file - switch { - case !d.writeFile.isNil(): - file = d.writeFile - case !d.readFile.isNil(): - file = d.readFile - default: - file = d.file - d.handleMu.RUnlock() - handleMuRLocked = false +// +checklocks:d.metadataMu +func (d *dentry) updateFromGetattrLocked(ctx context.Context, file p9file) error { + handleMuRLocked := false + if file.isNil() { + // Use d.readFile or d.writeFile, which represent 9P FIDs that have + // been opened, in preference to d.file, which represents a 9P fid that + // has not. This may be significantly more efficient in some + // implementations. Prefer d.writeFile over d.readFile since some + // filesystem implementations may update a writable handle's metadata + // after writes to that handle, without making metadata updates + // immediately visible to read-only handles representing the same file. + d.handleMu.RLock() + switch { + case !d.writeFile.isNil(): + file = d.writeFile + handleMuRLocked = true + case !d.readFile.isNil(): + file = d.readFile + handleMuRLocked = true + default: + file = d.file + d.handleMu.RUnlock() + } } _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) if handleMuRLocked { - d.handleMu.RUnlock() // must be released before updateFromP9AttrsLocked() + // handleMu must be released before updateFromP9AttrsLocked(). + d.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err @@ -1090,7 +1095,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { - return syserror.EPERM + return linuxerr.EPERM } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { @@ -1108,9 +1113,9 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs case linux.S_IFREG: // ok case linux.S_IFDIR: - return syserror.EISDIR + return linuxerr.EISDIR default: - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -1157,6 +1162,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if !d.isSynthetic() { if stat.Mask != 0 { + if stat.Mask&linux.STATX_SIZE != 0 { + // d.dataMu must be held around the update to both the remote + // file's size and d.size to serialize with writeback (which + // might otherwise write data back up to the old d.size after + // the remote file has been truncated). + d.dataMu.Lock() + } if err := d.file.setAttr(ctx, p9.SetAttrMask{ Permissions: stat.Mask&linux.STATX_MODE != 0, UID: stat.Mask&linux.STATX_UID != 0, @@ -1176,13 +1188,16 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs MTimeSeconds: uint64(stat.Mtime.Sec), MTimeNanoSeconds: uint64(stat.Mtime.Nsec), }); err != nil { + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Unlock() // +checklocksforce: locked conditionally above + } return err } if stat.Mask&linux.STATX_SIZE != 0 { // d.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. - d.updateSizeLocked(stat.Size) + d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above } } if d.fs.opts.interop == InteropModeShared { @@ -1245,6 +1260,14 @@ func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate // Preconditions: d.metadataMu must be locked. func (d *dentry) updateSizeLocked(newSize uint64) { d.dataMu.Lock() + d.updateSizeAndUnlockDataMuLocked(newSize) +} + +// Preconditions: d.metadataMu and d.dataMu must be locked. +// +// Postconditions: d.dataMu is unlocked. +// +checklocksrelease:d.dataMu +func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { oldSize := d.size atomic.StoreUint64(&d.size, newSize) // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings @@ -1253,9 +1276,9 @@ func (d *dentry) updateSizeLocked(newSize uint64) { // contents beyond the new d.size. (We are still holding d.metadataMu, // so we can't race with Write or another truncate.) d.dataMu.Unlock() - if d.size < oldSize { + if newSize < oldSize { oldpgend, _ := hostarch.PageRoundUp(oldSize) - newpgend, _ := hostarch.PageRoundUp(d.size) + newpgend, _ := hostarch.PageRoundUp(newSize) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ @@ -1271,8 +1294,8 @@ func (d *dentry) updateSizeLocked(newSize uint64) { // truncated pages have been removed from the remote file, they // should be dropped without being written back. d.dataMu.Lock() - d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) - d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) + d.cache.Truncate(newSize, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) d.dataMu.Unlock() } } @@ -1288,7 +1311,7 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats // to the remote filesystem. This is inconsistent with Linux's 9p client, // but consistent with other filesystems (e.g. FUSE). if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) kuid := auth.KUID(atomic.LoadUint32(&d.uid)) @@ -1469,7 +1492,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo if d.isDeleted() { d.watches.HandleDeletion(ctx) } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return } // If d still has inotify watches and it is not deleted or invalidated, it @@ -1497,7 +1520,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo delete(d.parent.children, d.name) d.parent.dirMu.Unlock() } - d.destroyLocked(ctx) + d.destroyLocked(ctx) // +checklocksforce: see above. return } @@ -1526,7 +1549,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } - d.fs.evictCachedDentryLocked(ctx) + d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } @@ -1543,6 +1566,7 @@ func (d *dentry) removeFromCacheLocked() { // Precondition: fs.renameMu must be locked for writing; it may be temporarily // unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { for fs.cachedDentriesLen != 0 { fs.evictCachedDentryLocked(ctx) @@ -1551,6 +1575,7 @@ func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { // Preconditions: // * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// +checklocks:fs.renameMu func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { fs.cacheMu.Lock() victim := fs.cachedDentries.Back() @@ -1587,7 +1612,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // will try to acquire fs.renameMu (which we have already acquired). Hence, // fs.renameMu will synchronize the destroy attempts. victim.cachingMu.Unlock() - victim.destroyLocked(ctx) + victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. } // destroyLocked destroys the dentry. @@ -1597,6 +1622,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { // * d.refs == 0. // * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. +// +checklocks:d.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: @@ -1630,18 +1656,18 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.dataMu.Unlock() // Clunk open fids and close open host FDs. if !d.readFile.isNil() { - d.readFile.close(ctx) + _ = d.readFile.close(ctx) } if !d.writeFile.isNil() && d.readFile != d.writeFile { - d.writeFile.close(ctx) + _ = d.writeFile.close(ctx) } d.readFile = p9file{} d.writeFile = p9file{} if d.readFD >= 0 { - unix.Close(int(d.readFD)) + _ = unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { - unix.Close(int(d.writeFD)) + _ = unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 @@ -1703,7 +1729,7 @@ func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size ui func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if d.file.isNil() { - return "", syserror.ENODATA + return "", linuxerr.ENODATA } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err @@ -1713,7 +1739,7 @@ func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vf func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if d.file.isNil() { - return syserror.EPERM + return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err @@ -1723,7 +1749,7 @@ func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vf func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { if d.file.isNil() { - return syserror.EPERM + return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err @@ -1763,7 +1789,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool openReadable := !d.readFile.isNil() || read openWritable := !d.writeFile.isNil() || write h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc) - if err == syserror.EACCES && (openReadable != read || openWritable != write) { + if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { // It may not be possible to use a single handle for both // reading and writing, since permissions on the file may have // changed to e.g. disallow reading after previously being @@ -2020,9 +2046,17 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { - // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if - // available? - if err := d.updateFromGetattr(ctx); err != nil { + // Use specialFileFD.handle.file for the getattr if available, for the + // same reason that we try to use open file handles in + // dentry.updateFromGetattrLocked(). + var file p9file + if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { + file = sffd.handle.file + } + d.metadataMu.Lock() + err := d.updateFromGetattrLocked(ctx, file) + d.metadataMu.Unlock() + if err != nil { return linux.Statx{}, err } } diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 5c57f6fea..02540a754 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sync" ) // handle represents a remote "open file descriptor", consisting of an opened @@ -130,3 +131,43 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } return uint64(n), cperr } + +type handleReadWriter struct { + ctx context.Context + h *handle + off uint64 +} + +var handleReadWriterPool = sync.Pool{ + New: func() interface{} { + return &handleReadWriter{} + }, +} + +func getHandleReadWriter(ctx context.Context, h *handle, offset int64) *handleReadWriter { + rw := handleReadWriterPool.Get().(*handleReadWriter) + rw.ctx = ctx + rw.h = h + rw.off = uint64(offset) + return rw +} + +func putHandleReadWriter(rw *handleReadWriter) { + rw.ctx = nil + rw.h = nil + handleReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + n, err := rw.h.readToBlocksAt(rw.ctx, dsts, rw.off) + rw.off += n + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + n, err := rw.h.writeFromBlocksAt(rw.ctx, srcs, rw.off) + rw.off += n + return n, err +} diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go index c7bf10007..505916a57 100644 --- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -21,7 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create @@ -78,7 +78,7 @@ func nonblockingPipeHasWriter(fd int32) (bool, error) { defer tempPipeMu.Unlock() // Copy 1 byte from fd into the temporary pipe. n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK) - if err == syserror.EAGAIN { + if linuxerr.Equals(linuxerr.EAGAIN, err) { // The pipe represented by fd is empty, but has a writer. return true, nil } @@ -108,6 +108,6 @@ func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { return nil case <-cancel: ctx.SleepFinish(false) - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } } diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index b0a429d42..5a3ddfc9d 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -16,9 +16,9 @@ package gofer import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/syserror" ) // p9file is a wrapper around p9.File that provides methods that are @@ -59,7 +59,7 @@ func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file if newfile != nil { p9file{newfile}.close(ctx) } - return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, linuxerr.EIO } return qids[0], p9file{newfile}, attrMask, attr, nil } diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index eed05e369..947dbe05f 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" @@ -34,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -79,17 +79,22 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } - // Skip flushing if there are client-buffered writes, since (as with the - // VFS1 client) we don't flush buffered writes on close anyway. d := fd.dentry() - if d.fs.opts.interop != InteropModeExclusive { - return nil - } - d.dataMu.RLock() - haveDirtyPages := !d.dirty.IsEmpty() - d.dataMu.RUnlock() - if haveDirtyPages { - return nil + if d.fs.opts.interop == InteropModeExclusive { + // d may have dirty pages that we won't write back now (and wouldn't + // have in VFS1), making a flushf RPC ineffective. If this is the case, + // skip the flushf. + // + // Note that it's also possible to have dirty pages under other interop + // modes if forcePageCache is in effect; we conservatively assume that + // applications have some way of tolerating this and still want the + // flushf. + d.dataMu.RLock() + haveDirtyPages := !d.dirty.IsEmpty() + d.dataMu.RUnlock() + if haveDirtyPages { + return nil + } } d.handleMu.RLock() defer d.handleMu.RUnlock() @@ -124,14 +129,14 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs }() if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } // Check for reading at EOF before calling into MM (but not under @@ -194,14 +199,14 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() @@ -297,7 +302,7 @@ func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64 pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} var freed []memmap.FileRange @@ -652,20 +657,20 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 offset += size case linux.SEEK_DATA: if offset > size { - return 0, syserror.ENXIO + return 0, linuxerr.ENXIO } // Use offset as specified. case linux.SEEK_HOLE: if offset > size { - return 0, syserror.ENXIO + return 0, linuxerr.ENXIO } offset = size } default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return offset, nil } @@ -678,28 +683,28 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() - switch d.fs.opts.interop { - case InteropModeExclusive: - // Any mapping is fine. - case InteropModeWritethrough: - // Shared writable mappings require a host FD, since otherwise we can't - // synchronously flush memory-mapped writes to the remote file. - if opts.Private || !opts.MaxPerms.Write { - break - } - fallthrough - case InteropModeShared: - // All mappings require a host FD to be coherent with other filesystem - // users. - if d.fs.opts.forcePageCache { - // Whether or not we have a host FD, we're not allowed to use it. - return syserror.ENODEV - } - if atomic.LoadInt32(&d.mmapFD) < 0 { - return syserror.ENODEV + // Force sentry page caching at your own risk. + if !d.fs.opts.forcePageCache { + switch d.fs.opts.interop { + case InteropModeExclusive: + // Any mapping is fine. + case InteropModeWritethrough: + // Shared writable mappings require a host FD, since otherwise we + // can't synchronously flush memory-mapped writes to the remote + // file. + if opts.Private || !opts.MaxPerms.Write { + break + } + fallthrough + case InteropModeShared: + // All mappings require a host FD to be coherent with other + // filesystem users. + if atomic.LoadInt32(&d.mmapFD) < 0 { + return linuxerr.ENODEV + } + default: + panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) } - default: - panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) } // After this point, d may be used as a memmap.Mappable. d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) @@ -707,14 +712,8 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } -func (d *dentry) mayCachePages() bool { - if d.fs.opts.interop == InteropModeShared { - return false - } - if d.fs.opts.forcePageCache { - return true - } - return atomic.LoadInt32(&d.mmapFD) >= 0 +func (fs *filesystem) mayCachePagesInMemoryFile() bool { + return fs.opts.forcePageCache || fs.opts.interop != InteropModeShared } // AddMapping implements memmap.Mappable.AddMapping. @@ -726,7 +725,7 @@ func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar host for _, r := range mapped { d.pf.hostFileMapper.IncRefOn(r) } - if d.mayCachePages() { + if d.fs.mayCachePagesInMemoryFile() { // d.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. mf := d.fs.mfp.MemoryFile() @@ -745,7 +744,7 @@ func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar h for _, r := range unmapped { d.pf.hostFileMapper.DecRefOn(r) } - if d.mayCachePages() { + if d.fs.mayCachePagesInMemoryFile() { // Pages that are no longer referenced by any application memory // mappings are now considered unused; allow MemoryFile to evict them // when necessary. diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 8f81f0822..226790a11 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -247,16 +247,16 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF if found && !d.isSynthetic() { // First dentry is where the search is starting, just update attributes // since it cannot be replaced. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. } - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. continue } // Note that synthetic dentries will always fails the comparison check // below. if !found || d.qidPath != stats[i].QID.Path { - d.metadataMu.Unlock() + d.metadataMu.Unlock() // +checklocksforce: see above. if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. @@ -298,7 +298,7 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF } // The file at this path hasn't changed. Just update cached metadata. - d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) + d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. d.metadataMu.Unlock() } @@ -354,6 +354,7 @@ func (r *revalidateState) add(name string, d *dentry) { r.dentries = append(r.dentries, d) } +// +checklocksignore func (r *revalidateState) lockAllMetadata() { for _, d := range r.dentries { d.metadataMu.Lock() @@ -372,6 +373,7 @@ func (r *revalidateState) popFront() *dentry { // reset releases all metadata locks and resets all fields to allow this // instance to be reused. +// +checklocksignore func (r *revalidateState) reset() { if r.locked { // Unlock any remaining dentries. diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go index 83e841a51..8dcbc61ed 100644 --- a/pkg/sentry/fsimpl/gofer/save_restore.go +++ b/pkg/sentry/fsimpl/gofer/save_restore.go @@ -21,13 +21,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) type saveRestoreContextID int @@ -92,7 +92,7 @@ func (fd *specialFileFD) savePipeData(ctx context.Context) error { fd.buf = append(fd.buf, buf[:n]...) } if err != nil { - if err == io.EOF || err == syserror.EAGAIN { + if err == io.EOF || linuxerr.Equals(linuxerr.EAGAIN, err) { break } return err @@ -158,6 +158,10 @@ func (d *dentryPlatformFile) afterLoad() { // afterLoad is invoked by stateify. func (fd *specialFileFD) afterLoad() { fd.handle.fd = -1 + if fd.hostFileMapper.IsInited() { + // Ensure that we don't call fd.hostFileMapper.Init() again. + fd.hostFileMapperInitOnce.Do(func() {}) + } } // CompleteRestore implements diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index c12444b7e..a8d47b65b 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -20,14 +20,17 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -41,6 +44,11 @@ import ( type specialFileFD struct { fileDescription + // releaseMu synchronizes the closing of fd.handle with fd.sync(). It's safe + // to access fd.handle without locking for operations that require a ref to + // be held by the caller, e.g. vfs.FileDescriptionImpl implementations. + releaseMu sync.RWMutex `state:"nosave"` + // handle is used for file I/O. handle is immutable. handle handle `state:"nosave"` @@ -70,6 +78,16 @@ type specialFileFD struct { bufMu sync.Mutex `state:"nosave"` haveBuf uint32 buf []byte + + // If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and + // hostFileMapperInitOnce is used to initialize it on first use. + hostFileMapperInitOnce sync.Once `state:"nosave"` + hostFileMapper fsutil.HostFileMapper + + // If handle.fd >= 0, fileRefs counts references on memmap.File offsets. + // fileRefs is protected by fileRefsMu. + fileRefsMu sync.Mutex `state:"nosave"` + fileRefs fsutil.FrameRefSet } func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { @@ -116,7 +134,10 @@ func (fd *specialFileFD) Release(ctx context.Context) { if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } + fd.releaseMu.Lock() fd.handle.close(ctx) + fd.releaseMu.Unlock() + fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() delete(fs.specialFileFDs, fd) @@ -183,14 +204,14 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs }() if fd.seekable && offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if d := fd.dentry(); d.cachedMetadataAuthoritative() { @@ -221,23 +242,13 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } } - // Going through dst.CopyOutFrom() would hold MM locks around file - // operations of unknown duration. For regularFileFD, doing so is necessary - // to support mmap due to lock ordering; MM locks precede dentry.dataMu. - // That doesn't hold here since specialFileFD doesn't client-cache data. - // Just buffer the read instead. - buf := make([]byte, dst.NumBytes()) - n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) - if err == syserror.EAGAIN { - err = syserror.ErrWouldBlock - } - if n == 0 { - return bufN, err + rw := getHandleReadWriter(ctx, &fd.handle, offset) + n, err := dst.CopyOutFrom(ctx, rw) + putHandleReadWriter(rw) + if linuxerr.Equals(linuxerr.EAGAIN, err) { + err = linuxerr.ErrWouldBlock } - if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { - return bufN + int64(cp), cperr - } - return bufN + int64(n), err + return bufN + n, err } // Read implements vfs.FileDescriptionImpl.Read. @@ -263,14 +274,14 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // offset should be ignored by PWrite. func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if fd.seekable && offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() @@ -308,20 +319,15 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } } - // Do a buffered write. See rationale in PRead. - buf := make([]byte, src.NumBytes()) - copied, copyErr := src.CopyIn(ctx, buf) - if copied == 0 && copyErr != nil { - // Only return the error if we didn't get any data. - return 0, offset, copyErr - } - n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset)) - if err == syserror.EAGAIN { - err = syserror.ErrWouldBlock + rw := getHandleReadWriter(ctx, &fd.handle, offset) + n, err := src.CopyInTo(ctx, rw) + putHandleReadWriter(rw) + if linuxerr.Equals(linuxerr.EAGAIN, err) { + err = linuxerr.ErrWouldBlock } // Update offset if the offset is valid. if offset >= 0 { - offset += int64(n) + offset += n } // Update file size for regular files. if fd.isRegularFile { @@ -332,10 +338,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off atomic.StoreUint64(&d.size, uint64(offset)) } } - if err != nil { - return int64(n), offset, err - } - return int64(n), offset, copyErr + return int64(n), offset, err } // Write implements vfs.FileDescriptionImpl.Write. @@ -354,7 +357,7 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if !fd.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } fd.mu.Lock() defer fd.mu.Unlock() @@ -372,6 +375,13 @@ func (fd *specialFileFD) Sync(ctx context.Context) error { } func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { + // Locks to ensure it didn't race with fd.Release(). + fd.releaseMu.RLock() + defer fd.releaseMu.RUnlock() + + if !fd.handle.isOpen() { + return nil + } err := func() error { // If we have a host FD, fsyncing it is likely to be faster than an fsync // RPC. @@ -396,3 +406,85 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error } return nil } + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if fd.handle.fd < 0 || fd.filesystem().opts.forcePageCache { + return linuxerr.ENODEV + } + // After this point, fd may be used as a memmap.Mappable and memmap.File. + fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init) + return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { + fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { + fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { + return fd.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { + mr := optional + if fd.filesystem().opts.limitHostFDTranslation { + mr = maxFillRange(required, optional) + } + return []memmap.Translation{ + { + Source: mr, + File: fd, + Offset: mr.Start, + Perms: hostarch.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// IncRef implements memmap.File.IncRef. +func (fd *specialFileFD) IncRef(fr memmap.FileRange) { + fd.fileRefsMu.Lock() + defer fd.fileRefsMu.Unlock() + fd.fileRefs.IncRefAndAccount(fr) +} + +// DecRef implements memmap.File.DecRef. +func (fd *specialFileFD) DecRef(fr memmap.FileRange) { + fd.fileRefsMu.Lock() + defer fd.fileRefsMu.Unlock() + fd.fileRefs.DecRefAndAccount(fr) +} + +// MapInternal implements memmap.File.MapInternal. +func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { + fd.requireHostFD() + return fd.hostFileMapper.MapInternal(fr, int(fd.handle.fd), at.Write) +} + +// FD implements memmap.File.FD. +func (fd *specialFileFD) FD() int { + fd.requireHostFD() + return int(fd.handle.fd) +} + +func (fd *specialFileFD) requireHostFD() { + if fd.handle.fd < 0 { + // This is possible if fd was successfully mmapped before saving, then + // was restored without a host FD. This is unrecoverable: without a + // host FD, we can't mmap this file post-restore. + panic("gofer.specialFileFD can no longer be memory-mapped without a host FD") + } +} diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index 2ec819f86..dbd834c67 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -41,7 +41,7 @@ func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { d.haveTarget = true d.target = target } - d.dataMu.Unlock() + d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. } return target, err } diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index b94dfeb7f..180a35583 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -45,10 +45,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fdnotifier", "//pkg/fspath", "//pkg/hostarch", - "//pkg/iovec", "//pkg/log", "//pkg/marshal/primitive", "//pkg/refs", @@ -70,7 +70,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/unet", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index a81f550b1..984c6e8ee 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -24,6 +24,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" @@ -36,11 +37,40 @@ import ( unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) +// These are the modes that are stored with virtualOwner. +const virtualOwnerModes = linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID + +// +stateify savable +type virtualOwner struct { + // This field is initialized at creation time and is immutable. + enabled bool + + // mu protects the fields below and they can be accessed using atomic memory + // operations. + mu sync.Mutex `state:"nosave"` + uid uint32 + gid uint32 + // mode is also stored, otherwise setting the host file to `0000` could remove + // access to the file. + mode uint32 +} + +func (v *virtualOwner) atomicUID() uint32 { + return atomic.LoadUint32(&v.uid) +} + +func (v *virtualOwner) atomicGID() uint32 { + return atomic.LoadUint32(&v.gid) +} + +func (v *virtualOwner) atomicMode() uint32 { + return atomic.LoadUint32(&v.mode) +} + // inode implements kernfs.Inode. // // +stateify savable @@ -97,6 +127,11 @@ type inode struct { // Event queue for blocking operations. queue waiter.Queue + // virtualOwner caches ownership and permission information to override the + // underlying file owner and permission. This is used to allow the unstrusted + // application to change these fields without affecting the host. + virtualOwner virtualOwner + // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data // read from the pipe from previous calls to inode.beforeSave(). haveBuf // and buf are protected by bufMu. haveBuf is accessed using atomic memory @@ -109,12 +144,12 @@ type inode struct { func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool) (*inode, error) { // Determine if hostFD is seekable. _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) - seekable := err != syserror.ESPIPE + seekable := !linuxerr.Equals(linuxerr.ESPIPE, err) // We expect regular files to be seekable, as this is required for them to // be memory-mappable. if !seekable && fileType == unix.S_IFREG { ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD) - return nil, syserror.ESPIPE + return nil, linuxerr.ESPIPE } i := &inode{ @@ -146,7 +181,7 @@ func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fil type NewFDOptions struct { // If Savable is true, the host file descriptor may be saved/restored by // numeric value; the sandbox API requires a corresponding host FD with the - // same numeric value to be provieded at time of restore. + // same numeric value to be provided at time of restore. Savable bool // If IsTTY is true, the file descriptor is a TTY. @@ -156,6 +191,12 @@ type NewFDOptions struct { // the new file description will inherit flags from hostFD. HaveFlags bool Flags uint32 + + // VirtualOwner allow the host file to have owner and permissions different + // than the underlying host file. + VirtualOwner bool + UID auth.KUID + GID auth.KGID } // NewFD returns a vfs.FileDescription representing the given host file @@ -167,8 +208,8 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) } // Retrieve metadata. - var s unix.Stat_t - if err := unix.Fstat(hostFD, &s); err != nil { + var stat unix.Stat_t + if err := unix.Fstat(hostFD, &stat); err != nil { return nil, err } @@ -182,11 +223,19 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) flags = uint32(flagsInt) } - d := &kernfs.Dentry{} - i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY) + fileType := linux.FileMode(stat.Mode).FileType() + i, err := newInode(ctx, fs, hostFD, opts.Savable, fileType, opts.IsTTY) if err != nil { return nil, err } + if opts.VirtualOwner { + i.virtualOwner.enabled = true + i.virtualOwner.uid = uint32(opts.UID) + i.virtualOwner.gid = uint32(opts.GID) + i.virtualOwner.mode = stat.Mode + } + + d := &kernfs.Dentry{} d.Init(&fs.Filesystem, i) // i.open will take a reference on d. @@ -195,15 +244,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, // etc.), and use the offset from the host fd otherwise when importing. - return i.open(ctx, d, mnt, flags) -} - -// ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { - return NewFD(ctx, mnt, hostFD, &NewFDOptions{ - Savable: true, - IsTTY: isTTY, - }) + return i.open(ctx, d, mnt, fileType, flags) } // filesystemType implements vfs.FilesystemType. @@ -269,7 +310,7 @@ func (fs *filesystem) MountOptions() string { // CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + if err := i.stat(&s); err != nil { return err } return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) @@ -278,7 +319,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a // Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + if err := i.stat(&s); err != nil { // Retrieving the mode from the host fd using fstat(2) should not fail. // If the syscall does not succeed, something is fundamentally wrong. panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) @@ -289,10 +330,10 @@ func (i *inode) Mode() linux.FileMode { // Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { - return linux.Statx{}, syserror.EINVAL + return linux.Statx{}, linuxerr.EINVAL } if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { - return linux.Statx{}, syserror.EINVAL + return linux.Statx{}, linuxerr.EINVAL } fs := vfsfs.Impl().(*filesystem) @@ -301,11 +342,11 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) - if err == syserror.ENOSYS { + if linuxerr.Equals(linuxerr.ENOSYS, err) { // Fallback to fstat(2), if statx(2) is not supported on the host. // // TODO(b/151263641): Remove fallback. - return i.fstat(fs) + return i.statxFromStat(fs) } if err != nil { return linux.Statx{}, err @@ -329,19 +370,35 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp // device numbers. ls.Mask |= s.Mask & linux.STATX_ALL if s.Mask&linux.STATX_TYPE != 0 { - ls.Mode |= s.Mode & linux.S_IFMT + if i.virtualOwner.enabled { + ls.Mode |= uint16(i.virtualOwner.atomicMode()) & linux.S_IFMT + } else { + ls.Mode |= s.Mode & linux.S_IFMT + } } if s.Mask&linux.STATX_MODE != 0 { - ls.Mode |= s.Mode &^ linux.S_IFMT + if i.virtualOwner.enabled { + ls.Mode |= uint16(i.virtualOwner.atomicMode()) &^ linux.S_IFMT + } else { + ls.Mode |= s.Mode &^ linux.S_IFMT + } } if s.Mask&linux.STATX_NLINK != 0 { ls.Nlink = s.Nlink } if s.Mask&linux.STATX_UID != 0 { - ls.UID = s.Uid + if i.virtualOwner.enabled { + ls.UID = i.virtualOwner.atomicUID() + } else { + ls.UID = s.Uid + } } if s.Mask&linux.STATX_GID != 0 { - ls.GID = s.Gid + if i.virtualOwner.enabled { + ls.GID = i.virtualOwner.atomicGID() + } else { + ls.GID = s.Gid + } } if s.Mask&linux.STATX_ATIME != 0 { ls.Atime = unixToLinuxStatxTimestamp(s.Atime) @@ -365,7 +422,7 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp return ls, nil } -// fstat is a best-effort fallback for inode.Stat() if the host does not +// statxFromStat is a best-effort fallback for inode.Stat() if the host does not // support statx(2). // // We ignore the mask and sync flags in opts and simply supply @@ -373,9 +430,9 @@ func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOp // of a mask or sync flags. fstat(2) does not provide any metadata // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so // those fields remain empty. -func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { +func (i *inode) statxFromStat(fs *filesystem) (linux.Statx, error) { var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + if err := i.stat(&s); err != nil { return linux.Statx{}, err } @@ -399,7 +456,21 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { }, nil } +func (i *inode) stat(stat *unix.Stat_t) error { + if err := unix.Fstat(i.hostFD, stat); err != nil { + return err + } + if i.virtualOwner.enabled { + stat.Uid = i.virtualOwner.atomicUID() + stat.Gid = i.virtualOwner.atomicGID() + stat.Mode = i.virtualOwner.atomicMode() + } + return nil +} + // SetStat implements kernfs.Inode.SetStat. +// +// +checklocksignore func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := &opts.Stat @@ -407,11 +478,22 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if m == 0 { return nil } - if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { - return syserror.EPERM + supportedModes := uint32(linux.STATX_MODE | linux.STATX_SIZE | linux.STATX_ATIME | linux.STATX_MTIME) + if i.virtualOwner.enabled { + if m&virtualOwnerModes != 0 { + // Take lock if any of the virtual owner fields will be updated. + i.virtualOwner.mu.Lock() + defer i.virtualOwner.mu.Unlock() + } + + supportedModes |= virtualOwnerModes } + if m&^supportedModes != 0 { + return linuxerr.EPERM + } + var hostStat unix.Stat_t - if err := unix.Fstat(i.hostFD, &hostStat); err != nil { + if err := i.stat(&hostStat); err != nil { return err } if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { @@ -419,13 +501,17 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } if m&linux.STATX_MODE != 0 { - if err := unix.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { - return err + if i.virtualOwner.enabled { + i.virtualOwner.mode = uint32(opts.Stat.Mode) + } else { + if err := unix.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { + return err + } } } if m&linux.STATX_SIZE != 0 { if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { - return syserror.EINVAL + return linuxerr.EINVAL } if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err @@ -448,6 +534,14 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre return err } } + if i.virtualOwner.enabled { + if m&linux.STATX_UID != 0 { + i.virtualOwner.uid = opts.Stat.UID + } + if m&linux.STATX_GID != 0 { + i.virtualOwner.gid = opts.Stat.GID + } + } return nil } @@ -470,18 +564,17 @@ func (i *inode) DecRef(ctx context.Context) { func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. if i.Mode().FileType() == linux.S_IFSOCK { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } - return i.open(ctx, d, rp.Mount(), opts.Flags) -} - -func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { - var s unix.Stat_t - if err := unix.Fstat(i.hostFD, &s); err != nil { + var stat unix.Stat_t + if err := i.stat(&stat); err != nil { return nil, err } - fileType := s.Mode & linux.FileTypeMask + fileType := linux.FileMode(stat.Mode).FileType() + return i.open(ctx, d, rp.Mount(), fileType, opts.Flags) +} +func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, fileType linux.FileMode, flags uint32) (*vfs.FileDescription, error) { // Constrain flags to a subset we can handle. // // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. @@ -491,7 +584,7 @@ func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flag case unix.S_IFSOCK: if i.isTTY { log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) - return nil, syserror.ENOTTY + return nil, linuxerr.ENOTTY } ep, err := newEndpoint(ctx, i.hostFD, &i.queue) @@ -529,7 +622,7 @@ func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flag default: log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) - return nil, syserror.EPERM + return nil, linuxerr.EPERM } } @@ -584,12 +677,12 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } i := f.inode if !i.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) @@ -601,7 +694,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } i := f.inode @@ -618,7 +711,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts if total != 0 { err = nil } else { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } } return total, err @@ -660,7 +753,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off // PWrite implements vfs.FileDescriptionImpl.PWrite. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { if !f.inode.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } return f.writeToHostFD(ctx, src, offset, opts.Flags) @@ -672,7 +765,7 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt if !i.seekable { n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) if isBlockError(err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } return n, err } @@ -700,7 +793,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque hostFD := f.inode.hostFD // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := src.CopyInTo(ctx, writer) @@ -721,7 +814,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { i := f.inode if !i.seekable { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } f.offsetMu.Lock() @@ -730,17 +823,17 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i switch whence { case linux.SEEK_SET: if offset < 0 { - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } f.offset = offset case linux.SEEK_CUR: // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. if offset > math.MaxInt64-f.offset { - return f.offset, syserror.EOVERFLOW + return f.offset, linuxerr.EOVERFLOW } if f.offset+offset < 0 { - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } f.offset += offset @@ -753,10 +846,10 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i // Check for overflow. Note that underflow cannot occur, since size >= 0. if offset > math.MaxInt64-size { - return f.offset, syserror.EOVERFLOW + return f.offset, linuxerr.EOVERFLOW } if size+offset < 0 { - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } f.offset = size + offset @@ -773,7 +866,7 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i default: // Invalid whence. - return f.offset, syserror.EINVAL + return f.offset, linuxerr.EINVAL } return f.offset, nil @@ -790,7 +883,7 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts // NOTE(b/38213152): Technically, some obscure char devices can be memory // mapped, but we only allow regular files. if f.inode.ftype != unix.S_IFREG { - return syserror.ENODEV + return linuxerr.ENODEV } i := f.inode i.CachedMappable.InitFileMapperOnce() diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index ca85f5601..709d5747d 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/waiter" @@ -158,9 +158,9 @@ func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMess if n < totalLen && err == nil { // The host only returns a short write if it would otherwise // block (and only for stream sockets). - err = syserror.EAGAIN + err = linuxerr.EAGAIN } - if n > 0 && err != syserror.EAGAIN { + if n > 0 && !linuxerr.Equals(linuxerr.EAGAIN, err) { // The caller may need to block to send more data, but // otherwise there isn't anything that can be done about an // error with a partial write. diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go index b123a63ee..292b44c43 100644 --- a/pkg/sentry/fsimpl/host/socket_iovec.go +++ b/pkg/sentry/fsimpl/host/socket_iovec.go @@ -16,8 +16,8 @@ package host import ( "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/iovec" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // copyToMulti copies as many bytes from src to dst as possible. @@ -64,13 +64,13 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec if length > maxlen { if truncate { stopLen = maxlen - err = syserror.EAGAIN + err = linuxerr.EAGAIN } else { - return 0, nil, nil, syserror.EMSGSIZE + return 0, nil, nil, linuxerr.EMSGSIZE } } - if iovsRequired > iovec.MaxIovs { + if iovsRequired > hostfd.MaxSendRecvMsgIov { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 0f9e20a84..04ac73255 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -17,13 +17,13 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -147,7 +147,7 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // Ignore arg[0]. This is the real FD: @@ -188,7 +188,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch pidns := kernel.PIDNamespaceFromContext(ctx) if pidns == nil { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } t.mu.Lock() @@ -211,15 +211,15 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change() // to -ENOTTY. - if err == syserror.EIO { - return 0, syserror.ENOTTY + if linuxerr.Equals(linuxerr.EIO, err) { + return 0, linuxerr.ENOTTY } return 0, err } // Check that calling task's process group is in the TTY session. if task.ThreadGroup().Session() != t.session { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } var pgIDP primitive.Int32 @@ -230,19 +230,19 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch // pgID must be non-negative. if pgID < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Process group with pgID must exist in this PID namespace. pidns := task.PIDNamespace() pg := pidns.ProcessGroupWithID(pgID) if pg == nil { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } // Check that new process group is in the TTY session. if pg.Session() != t.session { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } t.fgProcessGroup = pg @@ -302,7 +302,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch unimpl.EmitUnimplementedEvent(ctx) fallthrough default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } @@ -345,7 +345,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // If the signal is SIGTTIN, then we are attempting to read // from the TTY. Don't send the signal and return EIO. if sig == linux.SIGTTIN { - return syserror.EIO + return linuxerr.EIO } // Otherwise, we are writing or changing terminal state. This is allowed. @@ -354,7 +354,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // If the process group is an orphan, return EIO. if pg.IsOrphan() { - return syserror.EIO + return linuxerr.EIO } // Otherwise, send the signal to the process group and return ERESTARTSYS. @@ -367,5 +367,5 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index 63b465859..9850f3f41 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -17,7 +17,7 @@ package host import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { @@ -42,7 +42,7 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { } // isBlockError checks if an error is EAGAIN or EWOULDBLOCK. -// If so, they can be transformed into syserror.ErrWouldBlock. +// If so, they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { - return err == syserror.EAGAIN || err == syserror.EWOULDBLOCK + return linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) } diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index b7d13cced..4b577ea43 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -104,6 +104,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/log", @@ -118,7 +119,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -135,6 +135,8 @@ go_test( ":kernfs", "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", @@ -142,7 +144,6 @@ go_test( "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "@com_github_google_go_cmp//cmp:go_default_library", ], diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 84b1c3745..9d7526e47 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -71,7 +71,7 @@ func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *D // inode attributes to be changed. Override SetStat() making it call // f.InodeAttrs to allow it. func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a @@ -137,5 +137,5 @@ func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. - return syserror.EPERM + return linuxerr.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index e55111af0..7db1473c4 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -19,11 +19,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -98,7 +98,7 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, l func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. - return syserror.EISDIR + return linuxerr.EISDIR } fd.LockFD.Init(locks) fd.seekEnd = fdOpts.SeekEnd @@ -248,10 +248,10 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) } default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.off = offset return offset, nil diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 8fac53c60..363ebc466 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // stepExistingLocked resolves rp.Component() in parent directory vfsd. @@ -39,7 +39,7 @@ import ( // Postcondition: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // Directory searchable? if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { @@ -70,7 +70,7 @@ afterSymlink: return d.parent, nil } if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } d.dirMu.Lock() next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name]) @@ -169,7 +169,7 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP } } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -196,7 +196,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving } } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -214,16 +214,16 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string return err } if name == "." || name == ".." { - return syserror.EEXIST + return linuxerr.EEXIST } if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } if _, ok := parent.children[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } if parent.VFSDentry().IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil { return err @@ -237,10 +237,10 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { parent := d.parent if parent == nil { - return syserror.EBUSY + return linuxerr.EBUSY } if parent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err @@ -317,7 +317,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -344,7 +344,7 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -361,10 +361,10 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -373,7 +373,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. d := vd.Dentry().Impl().(*Dentry) if d.isDir() { - return syserror.EPERM + return linuxerr.EPERM } childI, err := parent.inode.NewLink(ctx, pc, d.inode) @@ -389,7 +389,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -411,7 +411,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v defer rp.Mount().EndWrite() childI, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { - if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { return err } childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode) @@ -425,7 +425,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -442,7 +442,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -508,10 +508,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf defer unlock() if rp.Done() { if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err @@ -535,18 +535,18 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } pc := rp.Component() if pc == "." || pc == ".." { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if len(pc) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } // Determine whether or not we need to create a file. child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */) - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { // Already checked for searchability above; now check for writability. if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -576,7 +576,7 @@ afterTrailingSymlink: } // Open existing file or follow symlink. if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if rp.ShouldFollowSymlink() && child.isSymlink() { targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) @@ -622,7 +622,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st } if !d.isSymlink() { fs.mu.RUnlock() - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // Inode.Readlink() cannot be called holding fs locks. @@ -648,13 +648,13 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Only RENAME_NOREPLACE is supported. if opts.Flags&^linux.RENAME_NOREPLACE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -680,17 +680,19 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newName := rp.Component() if newName == "." || newName == ".." { if noReplace { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } - switch err := checkCreateLocked(ctx, rp.Credentials(), newName, dstDir); err { - case nil: + + err = checkCreateLocked(ctx, rp.Credentials(), newName, dstDir) + switch { + case err == nil: // Ok, continue with rename as replacement. - case syserror.EEXIST: + case linuxerr.Equals(linuxerr.EEXIST, err): if noReplace { // Won't overwrite existing node since RENAME_NOREPLACE was requested. - return syserror.EEXIST + return linuxerr.EEXIST } dst = dstDir.children[newName] if dst == nil { @@ -749,7 +751,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa fs.deferDecRef(replaced) replaceVFSD = replaced.VFSDentry() } - virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) // +checklocksforce: to may be nil, that's okay. return nil } @@ -771,10 +773,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if !d.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if d.inode.HasChildren() { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } virtfs := rp.VirtualFilesystem() parentDentry := d.parent @@ -785,7 +787,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mntns.DecRef(ctx) vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { - return err + return err // +checklocksforce: vfsd is not locked. } if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { @@ -841,7 +843,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { if rp.Done() { - return syserror.EEXIST + return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) @@ -858,7 +860,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -892,7 +894,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if d.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } virtfs := rp.VirtualFilesystem() parentDentry := d.parent @@ -927,7 +929,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. @@ -940,7 +942,7 @@ func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, err } // kernfs currently does not support extended attributes. - return nil, syserror.ENOTSUP + return nil, linuxerr.ENOTSUP } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. @@ -953,7 +955,7 @@ func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", err } // kernfs currently does not support extended attributes. - return "", syserror.ENOTSUP + return "", linuxerr.ENOTSUP } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. @@ -966,7 +968,7 @@ func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return err } // kernfs currently does not support extended attributes. - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. @@ -979,7 +981,7 @@ func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, return err } // kernfs currently does not support extended attributes. - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 3d0866ecf..b96dc9ef7 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,12 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // InodeNoopRefCount partially implements the Inode interface, specifically the @@ -61,27 +61,27 @@ type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewDir implements Inode.NewDir. func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewLink implements Inode.NewLink. func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewSymlink implements Inode.NewSymlink. func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewNode implements Inode.NewNode. func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // InodeNotDirectory partially implements the Inode interface, specifically the @@ -158,12 +158,12 @@ type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // Getlink implements Inode.Getlink. func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { - return vfs.VirtualDentry{}, "", syserror.EINVAL + return vfs.VirtualDentry{}, "", linuxerr.EINVAL } // InodeAttrs partially implements the Inode interface, specifically the @@ -233,6 +233,11 @@ func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(atomic.LoadUint32(&a.mode)) } +// Links returns the link count. +func (a *InodeAttrs) Links() uint32 { + return atomic.LoadUint32(&a.nlink) +} + // TouchAtime updates a.atime to the current time. func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) { if mnt.Flags.NoATime || mnt.ReadOnly() { @@ -285,10 +290,10 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut // allowed by kernfs files but does not do anything. If some other behavior is // needed, the embedder should consider extending SetStat. if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { - return syserror.EPERM + return linuxerr.EPERM } if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err @@ -474,7 +479,7 @@ func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error s, ok := o.set[name] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. @@ -501,6 +506,30 @@ func (o *OrderedChildren) Insert(name string, child Inode) error { return o.insert(name, child, false) } +// Inserter is like Insert, but obtains the child to insert by calling +// makeChild. makeChild is only called if the insert will succeed. This allows +// the caller to atomically check and insert a child without having to +// clean up the child on failure. +func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) { + o.mu.Lock() + defer o.mu.Unlock() + if _, ok := o.set[name]; ok { + return nil, linuxerr.EEXIST + } + + // Note: We must not fail after we call makeChild(). + + child := makeChild() + s := &slot{ + name: name, + inode: child, + static: false, + } + o.order.PushBack(s) + o.set[name] = s + return child, nil +} + // insert inserts child into o. // // Precondition: Caller must be holding a ref on child if static is true. @@ -510,7 +539,7 @@ func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } s := &slot{ name: name, @@ -558,7 +587,7 @@ func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, n func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if s.inode != child { panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) @@ -569,7 +598,7 @@ func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { // Unlink implements Inode.Unlink. func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { - return syserror.EPERM + return linuxerr.EPERM } o.mu.Lock() defer o.mu.Unlock() @@ -599,15 +628,15 @@ func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) e // Postcondition: reference on any replaced dentry transferred to caller. func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { if !o.writable { - return syserror.EPERM + return linuxerr.EPERM } dst, ok := dstDir.(interface{}).(*OrderedChildren) if !ok { - return syserror.EXDEV + return linuxerr.EXDEV } if !dst.writable { - return syserror.EPERM + return linuxerr.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename @@ -653,7 +682,7 @@ type InodeSymlink struct { // Open implements Inode.Open. func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP } // StaticDirectory is a standard implementation of a directory with static @@ -709,7 +738,7 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *De // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DecRef implements Inode.DecRef. @@ -745,5 +774,5 @@ type InodeNoStatFS struct{} // StatFS implements Inode.StatFS. func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { - return linux.Statfs{}, syserror.ENOSYS + return linux.Statfs{}, linuxerr.ENOSYS } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 6f699c9cd..544698694 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -52,7 +52,7 @@ // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu // (inode implementation locks, if any) -// kernfs.Filesystem.droppedDentriesMu +// kernfs.Filesystem.deferredDecRefsMu package kernfs import ( @@ -61,6 +61,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -76,12 +77,12 @@ import ( type Filesystem struct { vfsfs vfs.Filesystem - droppedDentriesMu sync.Mutex `state:"nosave"` + deferredDecRefsMu sync.Mutex `state:"nosave"` - // droppedDentries is a list of dentries waiting to be DecRef()ed. This is + // deferredDecRefs is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for - // writing. Protected by droppedDentriesMu. - droppedDentries []*Dentry + // writing. Protected by deferredDecRefsMu. + deferredDecRefs []refsvfs2.RefCounter // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but @@ -131,25 +132,49 @@ type Filesystem struct { // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. // This may be called while Filesystem.mu or Dentry.dirMu is locked. -func (fs *Filesystem) deferDecRef(d *Dentry) { - fs.droppedDentriesMu.Lock() - fs.droppedDentries = append(fs.droppedDentries, d) - fs.droppedDentriesMu.Unlock() +func (fs *Filesystem) deferDecRef(d refsvfs2.RefCounter) { + fs.deferredDecRefsMu.Lock() + fs.deferredDecRefs = append(fs.deferredDecRefs, d) + fs.deferredDecRefsMu.Unlock() +} + +// SafeDecRefFD safely DecRef the FileDescription making sure DecRef is deferred +// in case Filesystem.mu is held. See comment on Filesystem.mu. +func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) { + if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // Only defer if dentry belongs to this filesystem, since locks cannot cross + // filesystems. + fs.deferDecRef(fd) + return + } + fd.DecRef(ctx) +} + +// SafeDecRef safely DecRef the virtual dentry making sure DecRef is deferred +// in case Filesystem.mu is held. See comment on Filesystem.mu. +func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) { + if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // Only defer if dentry belongs to this filesystem, since locks cannot cross + // filesystems. + fs.deferDecRef(&vd) + return + } + vd.DecRef(ctx) } // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the -// droppedDentries list. See comment on Filesystem.mu. +// deferredDecRefs list. See comment on Filesystem.mu. // // Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { - fs.droppedDentriesMu.Lock() - for _, d := range fs.droppedDentries { - // Defer the DecRef call so that we are not holding droppedDentriesMu + fs.deferredDecRefsMu.Lock() + for _, d := range fs.deferredDecRefs { + // Defer the DecRef call so that we are not holding deferredDecRefsMu // when DecRef is called. defer d.DecRef(ctx) } - fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. - fs.droppedDentriesMu.Unlock() + fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse. + fs.deferredDecRefsMu.Unlock() } // VFSFilesystem returns the generic vfs filesystem object. @@ -518,6 +543,63 @@ func (d *Dentry) FSLocalPath() string { return b.String() } +// WalkDentryTree traverses p in the dentry tree for this filesystem. Note that +// this only traverses the dentry tree and is not a general path traversal. No +// symlinks and dynamic children are resolved, and no permission checks are +// performed. The caller is responsible for ensuring the returned Dentry exists +// for an appropriate lifetime. +// +// p is interpreted starting at d, and may be absolute or relative (absolute vs +// relative paths both refer to the same target here, since p is absolute from +// d). p may contain "." and "..", but will not allow traversal above d (similar +// to ".." at the root dentry). +// +// This is useful for filesystem internals, where the filesystem may not be +// mounted yet. For a mounted filesystem, use GetDentryAt. +func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) { + d.fs.mu.RLock() + defer d.fs.processDeferredDecRefs(ctx) + defer d.fs.mu.RUnlock() + + target := d + + for pit := p.Begin; pit.Ok(); pit = pit.Next() { + pc := pit.String() + + switch { + case target == nil: + return nil, linuxerr.ENOENT + case pc == ".": + // No-op, consume component and continue. + case pc == "..": + if target == d { + // Don't let .. traverse above the start point of the walk. + continue + } + target = target.parent + // Parent doesn't need revalidation since we revalidated it on the + // way to the child, and we're still holding fs.mu. + default: + var err error + + d.dirMu.Lock() + target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc]) + d.dirMu.Unlock() + + if err != nil { + return nil, err + } + } + } + + if target == nil { + return nil, linuxerr.ENOENT + } + + target.IncRef() + return target, nil +} + // The Inode interface maps filesystem-level operations that operate on paths to // equivalent operations on specific filesystem nodes. // @@ -643,12 +725,15 @@ type inodeDirectory interface { // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child - // is a directory, checking for an empty directory. + // is a directory, or checking for an empty directory. RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being - // renamed. child should point to the resolved child in the source - // directory. + // renamed. child points to the resolved child in the source directory. + // dstDir is guaranteed to be a directory inode. + // + // On a successful call to Rename, the caller updates the dentry tree to + // reflect the name change. // // Precondition: Caller must serialize concurrent calls to Rename. Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 1cd3137e6..a2aba9321 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -22,12 +22,13 @@ import ( "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -94,7 +95,7 @@ type attrs struct { } func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } type readonlyDir struct { @@ -196,15 +197,15 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (k } func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } func (fsType) Name() string { @@ -318,10 +319,10 @@ func TestDirFDReadWrite(t *testing.T) { defer fd.DecRef(sys.Ctx) // Read/Write should fail for directory FDs. - if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { + if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); !linuxerr.Equals(linuxerr.EISDIR, err) { t.Fatalf("Read for directory FD failed with unexpected error: %v", err) } - if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF { + if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); !linuxerr.Equals(linuxerr.EBADF, err) { t.Fatalf("Write for directory FD failed with unexpected error: %v", err) } } @@ -346,3 +347,63 @@ func TestDirFDIterDirents(t *testing.T) { "file1": linux.DT_REG, }) } + +func TestDirWalkDentryTree(t *testing.T) { + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "dir1": fs.newDir(ctx, creds, 0755, nil), + "dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "file1": fs.newFile(ctx, creds, staticFileContent), + "dir3": fs.newDir(ctx, creds, 0755, nil), + }), + }) + }) + defer sys.Destroy() + + testWalk := func(from *kernfs.Dentry, getDentryPath, walkPath string, expectedErr error) { + var d *kernfs.Dentry + if getDentryPath != "" { + pop := sys.PathOpAtRoot(getDentryPath) + vd := sys.GetDentryOrDie(pop) + defer vd.DecRef(sys.Ctx) + d = vd.Dentry().Impl().(*kernfs.Dentry) + } + + match, err := from.WalkDentryTree(sys.Ctx, sys.VFS, fspath.Parse(walkPath)) + if err == nil { + defer match.DecRef(sys.Ctx) + } + + if err != expectedErr { + t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) unexpected error, want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, expectedErr, err) + } + if expectedErr != nil { + return + } + + if d != match { + t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) found unexpected dentry; want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, d, match) + } + } + + rootD := sys.Root.Dentry().Impl().(*kernfs.Dentry) + + testWalk(rootD, "dir1", "/dir1", nil) + testWalk(rootD, "", "/dir-non-existent", linuxerr.ENOENT) + testWalk(rootD, "", "/dir1/child-non-existent", linuxerr.ENOENT) + testWalk(rootD, "", "/dir2/inner-non-existent/dir3", linuxerr.ENOENT) + + testWalk(rootD, "dir2/dir3", "/dir2/../dir2/dir3", nil) + testWalk(rootD, "dir2/dir3", "/dir2/././dir3", nil) + testWalk(rootD, "dir2/dir3", "/dir2/././dir3/.././dir3", nil) + + pop := sys.PathOpAtRoot("dir2") + dir2VD := sys.GetDentryOrDie(pop) + defer dir2VD.DecRef(sys.Ctx) + dir2D := dir2VD.Dentry().Impl().(*kernfs.Dentry) + + testWalk(dir2D, "dir2/dir3", "/dir3", nil) + testWalk(dir2D, "dir2/dir3", "/../../../dir3", nil) + testWalk(dir2D, "dir2/file1", "/file1", nil) + testWalk(dir2D, "dir2/file1", "file1", nil) +} diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index a0736c0d6..4adf76ce6 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -17,9 +17,9 @@ package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // StaticSymlink provides an Inode implementation for symlinks that point to @@ -62,5 +62,5 @@ func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go index 11694c392..c91d23b56 100644 --- a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // syntheticDirectory implements kernfs.Inode for a directory created by @@ -65,13 +65,13 @@ func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, // NewFile implements Inode.NewFile. func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewDir implements Inode.NewDir. func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) { if !opts.ForSyntheticMountpoint { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } subdirI := newSyntheticDirectory(ctx, auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) if err := dir.OrderedChildren.Insert(name, subdirI); err != nil { @@ -84,17 +84,17 @@ func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs // NewLink implements Inode.NewLink. func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewSymlink implements Inode.NewSymlink. func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // NewNode implements Inode.NewNode. func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } // DecRef implements Inode.DecRef. diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD index 5504476c8..d16dfef9b 100644 --- a/pkg/sentry/fsimpl/overlay/BUILD +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -29,6 +29,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/log", @@ -41,7 +42,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 45aa5a494..520487066 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -20,12 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) func (d *dentry) isCopiedUp() bool { @@ -36,6 +36,10 @@ func (d *dentry) isCopiedUp() bool { // // Preconditions: filesystem.renameMu must be locked. func (d *dentry) copyUpLocked(ctx context.Context) error { + return d.copyUpMaybeSyntheticMountpointLocked(ctx, false /* forSyntheticMountpoint */) +} + +func (d *dentry) copyUpMaybeSyntheticMountpointLocked(ctx context.Context, forSyntheticMountpoint bool) error { // Fast path. if d.isCopiedUp() { return nil @@ -51,15 +55,15 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { // Can be copied-up. default: // Can't be copied-up. - return syserror.EPERM + return linuxerr.EPERM } // Ensure that our parent directory is copied-up. if d.parent == nil { // d is a filesystem root with no upper layer. - return syserror.EROFS + return linuxerr.EROFS } - if err := d.parent.copyUpLocked(ctx); err != nil { + if err := d.parent.copyUpMaybeSyntheticMountpointLocked(ctx, forSyntheticMountpoint); err != nil { return err } @@ -71,7 +75,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if d.vfsd.IsDead() { // Raced with deletion of d. - return syserror.ENOENT + return linuxerr.ENOENT } // Obtain settable timestamps from the lower layer. @@ -168,7 +172,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { case linux.S_IFDIR: if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{ - Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + ForSyntheticMountpoint: forSyntheticMountpoint, }); err != nil { return err } @@ -271,7 +276,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if upperStat.Mask&linux.STATX_INO == 0 { cleanupUndoCopyUp() - return syserror.EREMOTE + return linuxerr.EREMOTE } atomic.StoreUint32(&d.devMajor, upperStat.DevMajor) atomic.StoreUint32(&d.devMinor, upperStat.DevMinor) @@ -349,7 +354,7 @@ func (d *dentry) copyXattrsLocked(ctx context.Context) error { lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0) if err != nil { - if err == syserror.EOPNOTSUPP { + if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { // There are no guarantees as to the contents of lowerXattrs. return nil } diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index df4492346..ad3cdbb56 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) func (d *dentry) isDir() bool { @@ -69,7 +69,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string return nil } // Non-whiteout file in the directory prevents rmdir. - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY })) if err != nil { readdirErr = err @@ -88,7 +88,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string } if stat.RdevMajor != 0 || stat.RdevMinor != 0 { // This file is a real character device, not a whiteout. - readdirErr = syserror.ENOTEMPTY + readdirErr = linuxerr.ENOTEMPTY return false } whiteouts[maybeWhiteoutName] = isUpper @@ -256,7 +256,7 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in switch whence { case linux.SEEK_SET: if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset == 0 { // Ensure that the next call to fd.IterDirents() calls @@ -268,13 +268,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in case linux.SEEK_CUR: offset += fd.off if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Don't clear fd.dirents in this case, even if offset == 0. fd.off = offset return fd.off, nil default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 6b6fa0bd5..3b3dcf836 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -21,13 +21,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs @@ -86,7 +86,7 @@ func putDentrySlice(ds *[]*dentry) { // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. // -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { @@ -112,7 +112,7 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]* putDentrySlice(*dsp) } -// +checklocks:fs.renameMu +// +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() @@ -137,7 +137,7 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // * !rp.Done(). func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, lookupLayer, error) { if !d.isDir() { - return nil, lookupLayerNone, syserror.ENOTDIR + return nil, lookupLayerNone, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, lookupLayerNone, err @@ -218,7 +218,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str Start: parentVD, Path: childPath, }, &vfs.GetDentryOptions{}) - if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { // The file doesn't exist on this layer. Proceed to the next one. return true } @@ -245,7 +245,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str return false } if stat.Mask&mask != mask { - lookupErr = syserror.EREMOTE + lookupErr = linuxerr.EREMOTE return false } @@ -313,7 +313,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str } if !topLookupLayer.existsInOverlay() { child.destroyLocked(ctx) - return nil, topLookupLayer, syserror.ENOENT + return nil, topLookupLayer, linuxerr.ENOENT } // Device and inode numbers were copied from the topmost layer above. Remap @@ -352,7 +352,7 @@ func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, nam }, &vfs.StatOptions{ Mask: linux.STATX_TYPE, }) - if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { // The file doesn't exist on this layer. Proceed to the next // one. return true @@ -365,7 +365,7 @@ func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, nam // Linux's overlayfs tends to return EREMOTE in cases where a file // is unusable for reasons that are not better captured by another // errno. - lookupErr = syserror.EREMOTE + lookupErr = linuxerr.EREMOTE return false } if isWhiteout(&stat) { @@ -437,7 +437,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving d = next } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -457,18 +457,26 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, d = next } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } +type createType int + +const ( + createNonDirectory createType = iota + createDirectory + createSyntheticMountpoint +) + // doCreateAt checks that creating a file at rp is permitted, then invokes // create to do so. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, ct createType, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) @@ -479,10 +487,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } name := rp.Component() if name == "." || name == ".." { - return syserror.EEXIST + return linuxerr.EEXIST } if parent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { @@ -494,18 +502,18 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // Determine if a file already exists at name. if _, ok := parent.children[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } childLayer, err := fs.lookupLayerLocked(ctx, parent, name) if err != nil { return err } if childLayer.existsInOverlay() { - return syserror.EEXIST + return linuxerr.EEXIST } - if !dir && rp.MustBeDir() { - return syserror.ENOENT + if ct == createNonDirectory && rp.MustBeDir() { + return linuxerr.ENOENT } mnt := rp.Mount() @@ -518,7 +526,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } // Ensure that the parent directory is copied-up so that we can create the // new file in the upper layer. - if err := parent.copyUpLocked(ctx); err != nil { + if err := parent.copyUpMaybeSyntheticMountpointLocked(ctx, ct == createSyntheticMountpoint); err != nil { return err } @@ -529,7 +537,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir parent.dirents = nil ev := linux.IN_CREATE - if dir { + if ct != createNonDirectory { ev |= linux.IN_ISDIR } parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) @@ -592,7 +600,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -618,13 +626,13 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } old := vd.Dentry().Impl().(*dentry) if old.isDir() { - return syserror.EPERM + return linuxerr.EPERM } if err := old.copyUpLocked(ctx); err != nil { return err @@ -671,7 +679,11 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + ct := createDirectory + if opts.ForSyntheticMountpoint { + ct = createSyntheticMountpoint + } + return fs.doCreateAt(ctx, rp, ct, func(parent *dentry, childName string, haveUpperWhiteout bool) error { vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: parent.upperVD, @@ -722,10 +734,10 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { // Disallow attempts to create whiteouts. if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 { - return syserror.EPERM + return linuxerr.EPERM } vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ @@ -779,10 +791,10 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf start := rp.Start().Impl().(*dentry) if rp.Done() { if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if start.isRegularFile() && mayWrite { if err := start.copyUpLocked(ctx); err != nil { @@ -806,12 +818,12 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, topLookupLayer, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) - if err == syserror.ENOENT && mayCreate { + if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds, topLookupLayer == lookupLayerUpperWhiteout) parent.dirMu.Unlock() return fd, err @@ -822,7 +834,7 @@ afterTrailingSymlink: } // Open existing child or follow symlink. if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx) @@ -836,7 +848,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } if rp.MustBeDir() && !child.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if child.isRegularFile() && mayWrite { if err := child.copyUpLocked(ctx); err != nil { @@ -864,14 +876,14 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts * if ftype == linux.S_IFDIR { // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Can't open directories writably. if ats.MayWrite() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fd := &directoryFD{} fd.LockFD.Init(&d.locks) @@ -918,7 +930,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving return nil, err } if parent.vfsd.IsDead() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -1027,19 +1039,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -1064,7 +1076,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { - return syserror.EINVAL + return linuxerr.EINVAL } if oldParent != newParent { if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { @@ -1073,7 +1085,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -1085,7 +1097,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer newParent.dirMu.Unlock() } if newParent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } var ( replaced *dentry @@ -1094,20 +1106,20 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa whiteouts map[string]bool ) replaced, replacedLayer, err = fs.getChildLocked(ctx, newParent, newName, &ds) - if err != nil && err != syserror.ENOENT { + if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } if replaced != nil { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } replaced.dirMu.Lock() defer replaced.dirMu.Unlock() @@ -1117,7 +1129,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } } @@ -1177,7 +1189,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa Root: replaced.upperVD, Start: replaced.upperVD, Path: fspath.Parse(whiteoutName), - }); err != nil && err != syserror.EEXIST { + }); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err)) } } @@ -1285,10 +1297,10 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer rp.Mount().EndWrite() name := rp.Component() if name == "." { - return syserror.EINVAL + return linuxerr.EINVAL } if name == ".." { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) @@ -1309,7 +1321,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if !child.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err @@ -1344,7 +1356,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error Root: child.upperVD, Start: child.upperVD, Path: fspath.Parse(whiteoutName), - }); err != nil && err != syserror.EEXIST { + }); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)) } } @@ -1476,7 +1488,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: parent.upperVD, @@ -1532,10 +1544,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer rp.Mount().EndWrite() name := rp.Component() if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) @@ -1556,7 +1568,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if child.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err @@ -1658,7 +1670,7 @@ func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Crede // Return EOPNOTSUPP when fetching an overlay attribute. // See fs/overlayfs/super.c:ovl_own_xattr_get(). if isOverlayXattr(opts.Name) { - return "", syserror.EOPNOTSUPP + return "", linuxerr.EOPNOTSUPP } // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get(). @@ -1696,7 +1708,7 @@ func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mo // Return EOPNOTSUPP when setting an overlay attribute. // See fs/overlayfs/super.c:ovl_own_xattr_set(). if isOverlayXattr(opts.Name) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set(). @@ -1741,7 +1753,7 @@ func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs // Linux passes the remove request to xattr_handler->set. // See fs/xattr.c:vfs_removexattr(). if isOverlayXattr(name) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } if err := mnt.CheckBeginWrite(); err != nil { diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 454c20d4f..46d9f1f1d 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -40,13 +40,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the default filesystem name. @@ -135,7 +135,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsopts, ok := fsoptsRaw.(FilesystemOptions) if fsoptsRaw != nil && !ok { ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } vfsroot := vfs.RootFromContext(ctx) if vfsroot.Ok() { @@ -145,7 +145,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if upperPathname, ok := mopts["upperdir"]; ok { if fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "upperdir") // Linux overlayfs also requires a workdir when upperdir is @@ -154,7 +154,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt upperPath := fspath.Parse(upperPathname) if !upperPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, @@ -181,7 +181,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { if len(fsopts.LowerRoots) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } delete(mopts, "lowerdir") lowerPathnames := strings.Split(lowerPathnamesStr, ":") @@ -189,7 +189,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, @@ -216,21 +216,21 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if len(mopts) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if len(fsopts.LowerRoots) == 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK if len(fsopts.LowerRoots) > maxLowerLayers { ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Take extra references held by the filesystem. @@ -277,13 +277,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if rootStat.Mask&rootStatMask != rootStatMask { root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EREMOTE + return nil, nil, linuxerr.EREMOTE } if isWhiteout(&rootStat) { ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } root.mode = uint32(rootStat.Mode) root.uid = rootStat.UID diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go index 82491a0f8..156ffeaeb 100644 --- a/pkg/sentry/fsimpl/overlay/regular_file.go +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -415,7 +415,7 @@ func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOp // Only permit mmap of regular files, since other file types may have // unpredictable behavior when mmapped (e.g. /dev/zero). if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG { - return syserror.ENODEV + return linuxerr.ENODEV } // Get a Mappable for the current top layer. diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index 278ee3c92..a50510031 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/sentry/fsimpl/kernfs", @@ -16,6 +17,5 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 08aedc2ad..af09195a7 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable @@ -152,7 +152,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. if opts.Stat.Mask == 0 { return nil } - return syserror.EPERM + return linuxerr.EPERM } // Open implements kernfs.Inode.Open. diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 2b628bd55..95cfbdc42 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -81,6 +81,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/refs", @@ -101,7 +102,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/usermem", @@ -119,6 +119,7 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", @@ -127,7 +128,6 @@ go_test( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index ce8f55b1f..f2697c12d 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -76,7 +76,7 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index c53cc0122..e04ae6660 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -20,11 +20,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // subtasksInode represents the inode for /proc/[pid]/task/ directory. @@ -70,15 +70,15 @@ func (fs *filesystem) newSubtasks(ctx context.Context, task *kernel.Task, pidns func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) if subTask == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if subTask.ThreadGroup() != i.task.ThreadGroup() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newTaskInode(ctx, subTask, i.pidns, false, i.cgroupControllers) } @@ -87,7 +87,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { - return offset, syserror.ENOENT + return offset, linuxerr.ENOENT } if relOffset >= int64(len(tasks)) { return offset, nil @@ -123,7 +123,7 @@ type subtasksFD struct { func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { if fd.task.ExitState() >= kernel.TaskExitZombie { - return syserror.ENOENT + return linuxerr.ENOENT } return fd.GenericDirectoryFD.IterDirents(ctx, cb) } @@ -131,7 +131,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } return fd.GenericDirectoryFD.Seek(ctx, offset, whence) } @@ -139,7 +139,7 @@ func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { - return linux.Statx{}, syserror.ENOENT + return linux.Statx{}, linuxerr.ENOENT } return fd.GenericDirectoryFD.Stat(ctx, opts) } @@ -147,7 +147,7 @@ func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Sta // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { if fd.task.ExitState() >= kernel.TaskExitZombie { - return syserror.ENOENT + return linuxerr.ENOENT } return fd.GenericDirectoryFD.SetStat(ctx, opts) } @@ -180,7 +180,7 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DecRef implements kernfs.Inode.DecRef. diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index d05cc1508..f54811edf 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -20,12 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // taskInode represents the inode for /proc/PID/ directory. @@ -49,7 +49,7 @@ var _ kernfs.Inode = (*taskInode)(nil) func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, fakeCgroupControllers map[string]string) (kernfs.Inode, error) { if task.ExitState() == kernel.TaskExitDead { - return nil, syserror.ESRCH + return nil, linuxerr.ESRCH } contents := map[string]kernfs.Inode{ @@ -65,8 +65,8 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns "io": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), "maps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}), "mem": fs.newMemInode(ctx, task, fs.NextIno(), 0400), - "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{task: task}), + "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}), + "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}), "net": fs.newTaskNetDir(ctx, task), "ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "net"), @@ -78,7 +78,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns "smaps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &smapsData{task: task}), "stat": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), "statm": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statmData{task: task}), - "status": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "status": fs.newStatusInode(ctx, task, pidns, fs.NextIno(), 0444), "uid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { @@ -124,7 +124,7 @@ func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.D // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // DecRef implements kernfs.Inode.DecRef. diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 4718fac7a..5c6412fc0 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,11 +22,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { @@ -42,12 +42,12 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) return file, flags } -func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { +func taskFDExists(ctx context.Context, fs *filesystem, t *kernel.Task, fd int32) bool { file, _ := getTaskFD(t, fd) if file == nil { return false } - file.DecRef(ctx) + fs.SafeDecRefFD(ctx, file) return true } @@ -142,11 +142,11 @@ func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.Ite func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } fd := int32(fdInt) - if !taskFDExists(ctx, i.task, fd) { - return nil, syserror.ENOENT + if !taskFDExists(ctx, i.fs, i.task, fd) { + return nil, linuxerr.ENOENT } return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil } @@ -198,6 +198,7 @@ type fdSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task fd int32 } @@ -206,6 +207,7 @@ var _ kernfs.Inode = (*fdSymlink)(nil) func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ + fs: fs, task: task, fd: fd, } @@ -216,11 +218,11 @@ func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd in func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } - defer file.DecRef(ctx) + defer s.fs.SafeDecRefFD(ctx, file) root := vfs.RootFromContext(ctx) - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) // Note: it's safe to reenter kernfs from Readlink if needed to resolve path. return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) @@ -229,9 +231,9 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { - return vfs.VirtualDentry{}, "", syserror.ENOENT + return vfs.VirtualDentry{}, "", linuxerr.ENOENT } - defer file.DecRef(ctx) + defer s.fs.SafeDecRefFD(ctx, file) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil @@ -239,7 +241,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // Valid implements kernfs.Inode.Valid. func (s *fdSymlink) Valid(ctx context.Context) bool { - return taskFDExists(ctx, s.task, s.fd) + return taskFDExists(ctx, s.fs, s.task, s.fd) } // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. @@ -276,13 +278,14 @@ func (fs *filesystem) newFDInfoDirInode(ctx context.Context, task *kernel.Task) func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } fd := int32(fdInt) - if !taskFDExists(ctx, i.task, fd) { - return nil, syserror.ENOENT + if !taskFDExists(ctx, i.fs, i.task, fd) { + return nil, linuxerr.ENOENT } data := &fdInfoData{ + fs: i.fs, task: i.task, fd: fd, } @@ -316,6 +319,7 @@ func (i *fdInfoDirInode) DecRef(ctx context.Context) { type fdInfoData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task fd int32 } @@ -326,9 +330,9 @@ var _ dynamicInode = (*fdInfoData)(nil) func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { file, descriptorFlags := getTaskFD(d.task, d.fd) if file == nil { - return syserror.ENOENT + return linuxerr.ENOENT } - defer file.DecRef(ctx) + defer d.fs.SafeDecRefFD(ctx, file) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt @@ -339,5 +343,5 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Valid implements kernfs.Inode.Valid. func (d *fdInfoData) Valid(ctx context.Context) bool { - return taskFDExists(ctx, d.task, d.fd) + return taskFDExists(ctx, d.fs, d.task, d.fd) } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index b294dfd6a..d3f9cf489 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsbridge" @@ -32,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -40,7 +40,7 @@ import ( // Linux 3.18, the limit is five lines." - user_namespaces(7) const maxIDMapLines = 5 -// mm gets the kernel task's MemoryManager. No additional reference is taken on +// getMM gets the kernel task's MemoryManager. No additional reference is taken on // mm here. This is safe because MemoryManager.destroy is required to leave the // MemoryManager in a state where it's still usable as a DynamicBytesSource. func getMM(task *kernel.Task) *mm.MemoryManager { @@ -70,9 +70,9 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { func checkTaskState(t *kernel.Task) error { switch t.ExitState() { case kernel.TaskExitZombie: - return syserror.EACCES + return linuxerr.EACCES case kernel.TaskExitDead: - return syserror.ESRCH + return linuxerr.ESRCH } return nil } @@ -109,7 +109,7 @@ var _ dynamicInode = (*auxvData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.task.ExitState() == kernel.TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } m, err := getMMIncRef(d.task) if err != nil { @@ -159,7 +159,7 @@ var _ dynamicInode = (*cmdlineData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.task.ExitState() == kernel.TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } m, err := getMMIncRef(d.task) if err != nil { @@ -227,7 +227,7 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { if int(arEnvv.Length()) > remaining { end, ok := arEnvv.Start.AddLength(uint64(remaining)) if !ok { - return syserror.EFAULT + return linuxerr.EFAULT } arEnvv.End = end } @@ -325,7 +325,7 @@ func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset in // the file ..." - user_namespaces(7) srclen := src.NumBytes() if srclen >= hostarch.PageSize || offset != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } b := make([]byte, srclen) if _, err := src.CopyIn(ctx, b); err != nil { @@ -345,7 +345,7 @@ func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset in } lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) if len(lines) > maxIDMapLines { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries := make([]auth.IDMapEntry, len(lines)) @@ -353,7 +353,7 @@ func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset in var e auth.IDMapEntry _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) if err != nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } entries[i] = e } @@ -408,7 +408,7 @@ func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.De // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH if !kernel.ContextCanTrace(ctx, f.task, true) { - return nil, syserror.EACCES + return nil, linuxerr.EACCES } if err := checkTaskState(f.task); err != nil { return nil, err @@ -422,7 +422,7 @@ func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.De // SetStat implements kernfs.Inode.SetStat. func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } var _ vfs.FileDescriptionImpl = (*memFD)(nil) @@ -461,10 +461,10 @@ func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, e case linux.SEEK_CUR: offset += fd.offset default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.offset = offset return offset, nil @@ -485,12 +485,12 @@ func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64 n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) if n > 0 { if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } return int64(n), nil } if readErr != nil { - return 0, syserror.EIO + return 0, linuxerr.EIO } return 0, nil } @@ -512,7 +512,7 @@ func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, e // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // Release implements vfs.FileDescriptionImpl.Release. @@ -608,12 +608,10 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) var vss, rss uint64 - s.task.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) + if mm := getMM(s.task); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize) // rsslim. @@ -649,63 +647,159 @@ var _ dynamicInode = (*statmData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss uint64 - s.task.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) - + if mm := getMM(s.task); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) return nil } -// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. +// statusInode implements kernfs.Inode for /proc/[pid]/status. // // +stateify savable -type statusData struct { - kernfs.DynamicBytesFile +type statusInode struct { + kernfs.InodeAttrs + kernfs.InodeNoStatFS + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink task *kernel.Task pidns *kernel.PIDNamespace + locks vfs.FileLocks +} + +// statusFD implements vfs.FileDescriptionImpl and vfs.DynamicByteSource for +// /proc/[pid]/status. +// +// +stateify savable +type statusFD struct { + statusFDLowerBase + vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD + + vfsfd vfs.FileDescription + + inode *statusInode + task *kernel.Task + pidns *kernel.PIDNamespace + userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns } -var _ dynamicInode = (*statusData)(nil) +// statusFDLowerBase is a dumb hack to ensure that statusFD prefers +// vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptinDefaultImpl +// methods. +// +// +stateify savable +type statusFDLowerBase struct { + vfs.FileDescriptionDefaultImpl +} + +func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode { + // Note: credentials are overridden by taskOwnedInode. + inode := &statusInode{ + task: task, + pidns: pidns, + } + inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm) + return &taskOwnedInode{Inode: inode, owner: task} +} + +// Open implements kernfs.Inode.Open. +func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &statusFD{ + inode: s, + task: s.task, + pidns: s.pidns, + userns: rp.Credentials().UserNamespace, + } + fd.LockFD.Init(&s.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + fd.SetDataSource(fd) + return &fd.vfsfd, nil +} + +// SetStat implements kernfs.Inode.SetStat. +func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return linuxerr.EPERM +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (s *statusFD) Release(ctx context.Context) { +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := s.vfsfd.VirtualDentry().Mount().Filesystem() + return s.inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + return linuxerr.EPERM +} // Generate implements vfs.DynamicBytesSource.Generate. -func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) + ppid := kernel.ThreadID(0) if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) if tracer := s.task.Tracer(); tracer != nil { tpid = s.pidns.IDOfTask(tracer) } fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) + + creds := s.task.Credentials() + ruid := creds.RealKUID.In(s.userns).OrOverflow() + euid := creds.EffectiveKUID.In(s.userns).OrOverflow() + suid := creds.SavedKUID.In(s.userns).OrOverflow() + rgid := creds.RealKGID.In(s.userns).OrOverflow() + egid := creds.EffectiveKGID.In(s.userns).OrOverflow() + sgid := creds.SavedKGID.In(s.userns).OrOverflow() var fds int var vss, rss, data uint64 s.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.CurrentMaxFDs() } - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - data = mm.VirtualDataSize() - } }) + if mm := getMM(s.task); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } + // Filesystem user/group IDs aren't implemented; effective UID/GID are used + // instead. + fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid) + fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid) fmt.Fprintf(buf, "FDSize:\t%d\n", fds) + buf.WriteString("Groups:\t ") + // There is a space between each pair of supplemental GIDs, as well as an + // unconditional trailing space that some applications actually depend on. + var sep string + for _, kgid := range creds.ExtraKGIDs { + fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow()) + sep = " " + } + buf.WriteString(" \n") + fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) + fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) - creds := s.task.Credentials() fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) @@ -762,7 +856,7 @@ var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { if o.task.ExitState() == kernel.TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) return nil @@ -784,7 +878,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset } if o.task.ExitState() == kernel.TaskExitDead { - return 0, syserror.ESRCH + return 0, linuxerr.ESRCH } if err := o.task.SetOOMScoreAdj(v); err != nil { return 0, err @@ -802,13 +896,17 @@ type exeSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*exeSymlink)(nil) func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { - inode := &exeSymlink{task: task} + inode := &exeSymlink{ + fs: fs, + task: task, + } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -819,14 +917,14 @@ func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if err != nil { return "", err } - defer exec.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, exec) root := vfs.RootFromContext(ctx) if !root.Ok() { // It could have raced with process deletion. - return "", syserror.ESRCH + return "", linuxerr.ESRCH } - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) vfsObj := exec.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) @@ -836,31 +934,23 @@ func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) // Getlink implements kernfs.Inode.Getlink. func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { - return vfs.VirtualDentry{}, "", syserror.EACCES + return vfs.VirtualDentry{}, "", linuxerr.EACCES } if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } - var err error - var exec fsbridge.File - s.task.WithMuLocked(func(t *kernel.Task) { - mm := t.MemoryManager() - if mm == nil { - err = syserror.EACCES - return - } + mm := getMM(s.task) + if mm == nil { + return vfs.VirtualDentry{}, "", linuxerr.EACCES + } - // The MemoryManager may be destroyed, in which case - // MemoryManager.destroy will simply set the executable to nil - // (with locks held). - exec = mm.Executable() - if exec == nil { - err = syserror.ESRCH - } - }) - if err != nil { - return vfs.VirtualDentry{}, "", err + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + exec := mm.Executable() + if exec == nil { + return vfs.VirtualDentry{}, "", linuxerr.ESRCH } defer exec.DecRef(ctx) @@ -878,13 +968,17 @@ type cwdSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*cwdSymlink)(nil) func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { - inode := &cwdSymlink{task: task} + inode := &cwdSymlink{ + fs: fs, + task: task, + } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -895,14 +989,14 @@ func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if err != nil { return "", err } - defer cwd.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, cwd) root := vfs.RootFromContext(ctx) if !root.Ok() { // It could have raced with process deletion. - return "", syserror.ESRCH + return "", linuxerr.ESRCH } - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) @@ -912,7 +1006,7 @@ func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) // Getlink implements kernfs.Inode.Getlink. func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { - return vfs.VirtualDentry{}, "", syserror.EACCES + return vfs.VirtualDentry{}, "", linuxerr.EACCES } if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err @@ -920,8 +1014,9 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent cwd := s.task.FSContext().WorkingDirectoryVFS2() if !cwd.Ok() { // It could have raced with process deletion. - return vfs.VirtualDentry{}, "", syserror.ESRCH + return vfs.VirtualDentry{}, "", linuxerr.ESRCH } + // The reference is transferred to the caller. return cwd, "", nil } @@ -931,6 +1026,7 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent type mountInfoData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task } @@ -951,7 +1047,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef(ctx) + defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } @@ -962,6 +1058,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { type mountsData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task } @@ -982,7 +1079,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef(ctx) + defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } @@ -1123,7 +1220,7 @@ func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error // exit this file show a task is in no cgroups, which is incorrect. Instead, // once a task has left its cgroups, we return an error. if d.task.ExitState() >= kernel.TaskExitInitiated { - return syserror.ESRCH + return linuxerr.ESRCH } d.task.GenerateProcTaskCgroup(buf) diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 177cb828f..ab47ea5a7 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -33,7 +34,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" ) @@ -679,7 +679,7 @@ func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { continue } if err := d.stack.Statistics(stat, line.prefix); err != nil { - if err == syserror.EOPNOTSUPP { + if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) } else { log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index cf905fae4..26d44744b 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -116,12 +116,12 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err case threadSelfName: return i.newThreadSelfSymlink(ctx, root), nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } task := i.pidns.TaskWithID(kernel.ThreadID(tid)) if task == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newTaskInode(ctx, task, i.pidns, true, i.fakeCgroupControllers) diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 045ed7a2d..4d3a2f7e6 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable @@ -53,11 +53,11 @@ func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } @@ -69,7 +69,7 @@ func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualD // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // +stateify savable @@ -94,12 +94,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? - return "", syserror.EINVAL + return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } @@ -111,7 +111,7 @@ func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vi // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // dynamicBytesFileSetAttr implements a special file that allows inode diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 2bc98a94f..99f64a9d8 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" ) @@ -209,7 +209,7 @@ func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -257,7 +257,7 @@ func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -311,7 +311,7 @@ func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -396,7 +396,7 @@ func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -449,7 +449,7 @@ func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error { func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -467,7 +467,7 @@ func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset i // Port numbers must be uint16s. if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index e534fbca8..14f806c3c 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -23,13 +23,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -227,7 +227,7 @@ func TestTasks(t *testing.T) { defer fd.DecRef(s.Ctx) buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) - if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { + if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); !linuxerr.Equals(linuxerr.EISDIR, err) { t.Errorf("wrong error reading directory: %v", err) } } @@ -237,7 +237,7 @@ func TestTasks(t *testing.T) { s.Creds, s.PathOpAtRoot("/proc/9999"), &vfs.OpenOptions{}, - ); err != syserror.ENOENT { + ); !linuxerr.Equals(linuxerr.ENOENT, err) { t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err) } } diff --git a/pkg/sentry/fsimpl/proc/yama.go b/pkg/sentry/fsimpl/proc/yama.go index e039ec45e..7240563d7 100644 --- a/pkg/sentry/fsimpl/proc/yama.go +++ b/pkg/sentry/fsimpl/proc/yama.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -56,7 +56,7 @@ func (s *yamaPtraceScope) Generate(ctx context.Context, buf *bytes.Buffer) error func (s *yamaPtraceScope) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // Ignore partial writes. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil @@ -73,7 +73,7 @@ func (s *yamaPtraceScope) Write(ctx context.Context, src usermem.IOSequence, off // We do not support YAMA levels > YAMA_SCOPE_RELATIONAL. if v < linux.YAMA_SCOPE_DISABLED || v > linux.YAMA_SCOPE_RELATIONAL { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } atomic.StoreInt32(s.level, v) diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD index adb610213..403c6f254 100644 --- a/pkg/sentry/fsimpl/signalfd/BUILD +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -9,10 +9,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/kernel", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index a7f5928b7..bdb03ef96 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -18,10 +18,10 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -91,7 +91,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) if err != nil { // There must be no signal available. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Copy out the signal info using the specified format. diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD index 9453277b8..9defca936 100644 --- a/pkg/sentry/fsimpl/sockfs/BUILD +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -9,10 +9,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 735756280..75934ecd0 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -20,11 +20,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // filesystemType implements vfs.FilesystemType. @@ -102,7 +102,7 @@ type inode struct { // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } // StatFS implements kernfs.Inode.StatFS. diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 09043b572..ab21f028e 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/coverage", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", @@ -35,7 +36,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go index b13f141a8..51f0bf3d8 100644 --- a/pkg/sentry/fsimpl/sys/kcov.go +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -17,13 +17,13 @@ package sys import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -85,11 +85,11 @@ func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallAr case linux.KCOV_DISABLE: if arg != 0 { // This arg is unused; it should be 0. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return 0, fd.kcov.DisableTrace(ctx) default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 14eb10dcd..f322d2747 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -23,12 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -74,7 +74,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } } @@ -174,7 +174,7 @@ func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } // Open implements kernfs.Inode.Open. diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 97aa20cd1..473b41cff 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -80,12 +80,8 @@ func Boot() (*kernel.Kernel, error) { } // Create timekeeper. - tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) - if err != nil { - return nil, fmt.Errorf("creating timekeeper: %v", err) - } + tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) tk.SetClocks(time.NewCalibratedClocks()) - k.SetTimekeeper(tk) creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) @@ -94,6 +90,7 @@ func Boot() (*kernel.Kernel, error) { if err = k.Init(kernel.InitKernelArgs{ ApplicationCores: uint(runtime.GOMAXPROCS(-1)), FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, RootUserNamespace: creds.UserNamespace, Vdso: vdso, RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD index 7ce7dc429..2b83d7d9a 100644 --- a/pkg/sentry/fsimpl/timerfd/BUILD +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -8,10 +8,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index cbb8b67c5..68b785791 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -19,10 +19,10 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -69,7 +69,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { var buf [sizeofUint64]byte @@ -81,7 +81,7 @@ func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequenc } return sizeofUint64, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Clock returns the timer fd's Clock. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index e21fddd7f..94486bb63 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -58,6 +58,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/log", @@ -81,7 +82,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sentry/vfs/memxattr", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -94,6 +94,7 @@ go_test( ":tmpfs", "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/refs", "//pkg/sentry/contexttest", @@ -101,7 +102,6 @@ go_test( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) @@ -118,12 +118,12 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/sentry/contexttest", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 3cc63e732..2c29343c1 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/contexttest" @@ -30,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Differences from stat_benchmark: @@ -68,7 +68,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent rel = wd } else { // Need to extract the given FD. - return syserror.EBADF + return linuxerr.EBADF } // Lookup the node. @@ -146,7 +146,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { for i := 0; i < b.N; i++ { err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { @@ -341,7 +341,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { for i := 0; i < b.N; i++ { err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index e8d256495..c25494c0b 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable @@ -196,10 +196,10 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in case linux.SEEK_CUR: offset += fd.off default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index f0f4297ef..e067f136e 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -20,12 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -45,7 +45,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -70,11 +70,11 @@ afterSymlink: return d.parent, nil } if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } child, ok := dir.childMap[name] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err @@ -112,7 +112,7 @@ func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) } dir, ok := d.inode.impl.(*directory) if !ok { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return dir, nil } @@ -132,7 +132,7 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) d = next } if rp.MustBeDir() && !d.inode.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -161,21 +161,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } name := rp.Component() if name == "." || name == ".." { - return syserror.EEXIST + return linuxerr.EEXIST } if len(name) > linux.NAME_MAX { - return syserror.ENAMETOOLONG + return linuxerr.ENAMETOOLONG } if _, ok := parentDir.childMap[name]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } if !dir && rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only // be dead if it was deleted. if parentDir.dentry.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -220,7 +220,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.inode.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -246,21 +246,21 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) i := d.inode if i.isDir() { - return syserror.EPERM + return linuxerr.EPERM } if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } if i.nlink == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } if i.nlink == maxLinks { - return syserror.EMLINK + return linuxerr.EMLINK } i.incLinksLocked() i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) @@ -274,7 +274,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() if parentDir.inode.nlink == maxLinks { - return syserror.EMLINK + return linuxerr.EMLINK } parentDir.inode.incLinksLocked() // from child's ".." childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) @@ -300,7 +300,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v case linux.S_IFSOCK: childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir) default: - return syserror.EINVAL + return linuxerr.EINVAL } child := fs.newDentry(childInode) parentDir.insertChildLocked(child, name) @@ -312,7 +312,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_TMPFILE != 0 { // Not yet supported. - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } // Handle O_CREAT and !O_CREAT separately, since in the latter case we @@ -344,10 +344,10 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } start.IncRef() defer start.DecRef(ctx) @@ -365,14 +365,14 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } name := rp.Component() if name == "." || name == ".." { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if len(name) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + return nil, linuxerr.ENAMETOOLONG } // Determine whether or not we need to create a file. child, ok := parentDir.childMap[name] @@ -401,7 +401,7 @@ afterTrailingSymlink: return fd, nil } if mustCreate { - return nil, syserror.EEXIST + return nil, linuxerr.EEXIST } // Is the file mounted over? if err := rp.CheckMount(ctx, &child.vfsd); err != nil { @@ -418,7 +418,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } if rp.MustBeDir() && !child.inode.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } child.IncRef() defer child.DecRef(ctx) @@ -456,7 +456,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case *directory: // Can't open directories writably. if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } var fd directoryFD fd.LockFD.Init(&d.inode.locks) @@ -466,13 +466,13 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH, which is handled at the VFS layer. - return nil, syserror.ELOOP + return nil, linuxerr.ELOOP case *namedPipe: return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO default: panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } @@ -488,7 +488,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st } symlink, ok := d.inode.impl.(*symlink) if !ok { - return "", syserror.EINVAL + return "", linuxerr.EINVAL } symlink.inode.touchAtime(rp.Mount()) return symlink.target, nil @@ -506,19 +506,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if opts.Flags&^linux.RENAME_NOREPLACE != 0 { // TODO(b/145974740): Support other renameat2 flags. - return syserror.EINVAL + return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.EBUSY + return linuxerr.EBUSY } mnt := rp.Mount() if mnt != oldParentVD.Mount() { - return syserror.EXDEV + return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err @@ -531,7 +531,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } renamed, ok := oldParentDir.childMap[oldName] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { return err @@ -541,7 +541,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // mounted filesystem. if renamed.inode.isDir() { if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { - return syserror.EINVAL + return linuxerr.EINVAL } if oldParentDir != newParentDir { // Writability is needed to change renamed's "..". @@ -551,7 +551,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } else { if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -561,33 +561,33 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replaced, ok := newParentDir.childMap[newName] if ok { if opts.Flags&linux.RENAME_NOREPLACE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } replacedDir, ok := replaced.inode.impl.(*directory) if ok { if !renamed.inode.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if len(replacedDir.childMap) != 0 { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } } else { if rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if renamed.inode.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } } else { if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks { - return syserror.EMLINK + return linuxerr.EMLINK } } // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can // only be dead if it was deleted. if newParentDir.dentry.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } // Linux places this check before some of those above; we do it here for @@ -646,24 +646,24 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } name := rp.Component() if name == "." { - return syserror.EINVAL + return linuxerr.EINVAL } if name == ".." { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } child, ok := parentDir.childMap[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err } childDir, ok := child.inode.impl.(*directory) if !ok { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if len(childDir.childMap) != 0 { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -753,20 +753,20 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } name := rp.Component() if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } child, ok := parentDir.childMap[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err } if child.inode.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -806,11 +806,11 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath switch impl := d.inode.impl.(type) { case *socketFile: if impl.ep == nil { - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } return impl.ep, nil default: - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } } diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 2f856ce36..99afd9817 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -20,11 +20,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -114,7 +114,7 @@ func TestNonblockingWriteError(t *testing.T) { } openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK} _, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) - if err != syserror.ENXIO { + if !linuxerr.Equals(linuxerr.ENXIO, err) { t.Fatalf("expected ENXIO, but got error: %v", err) } } @@ -201,7 +201,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { readData := make([]byte, 1) dst := usermem.BytesIOSequence(readData) bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) } if bytesRead != 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index c45bddff6..0f2ac6144 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -33,7 +34,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -185,7 +185,7 @@ func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { // Can we grow the file? if rf.seals&linux.F_SEAL_GROW != 0 { rf.dataMu.Unlock() - return false, syserror.EPERM + return false, linuxerr.EPERM } // We only need to update the file size. atomic.StoreUint64(&rf.size, newSize) @@ -196,7 +196,7 @@ func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { // We are shrinking the file. First check if this is allowed. if rf.seals&linux.F_SEAL_SHRINK != 0 { rf.dataMu.Unlock() - return false, syserror.EPERM + return false, linuxerr.EPERM } // Update the file size. @@ -233,7 +233,7 @@ func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, a // Reject writable mapping if F_SEAL_WRITE is set. if rf.seals&linux.F_SEAL_WRITE != 0 && writable { - return syserror.EPERM + return linuxerr.EPERM } rf.mappings.AddMapping(ms, ar, offset, writable) @@ -366,7 +366,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs fsmetric.TmpfsReads.Increment() if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since @@ -374,7 +374,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -407,7 +407,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // final offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since @@ -415,7 +415,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, offset, syserror.EOPNOTSUPP + return 0, offset, linuxerr.EOPNOTSUPP } srclen := src.NumBytes() @@ -432,7 +432,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } if end := offset + srclen; end < offset { // Overflow. - return 0, offset, syserror.EINVAL + return 0, offset, linuxerr.EINVAL } srclen, err = vfs.CheckLimit(ctx, offset, srclen) @@ -476,10 +476,10 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) ( case linux.SEEK_END: offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size)) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.off = offset return offset, nil @@ -594,7 +594,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, // Check if seals prevent either file growth or all writes. switch { case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed - return 0, syserror.EPERM + return 0, linuxerr.EPERM case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would // normally grow the file to partially succeed up to the current EOF, @@ -615,7 +615,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, } if end <= rw.off { // Truncation would result in no data being written. - return 0, syserror.EPERM + return 0, linuxerr.EPERM } } @@ -684,7 +684,7 @@ exitLoop: func GetSeals(fd *vfs.FileDescription) (uint32, error) { f, ok := fd.Impl().(*regularFileFD) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.dataMu.RLock() @@ -696,7 +696,7 @@ func GetSeals(fd *vfs.FileDescription) (uint32, error) { func AddSeals(fd *vfs.FileDescription, val uint32) error { f, ok := fd.Impl().(*regularFileFD) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.mapsMu.Lock() @@ -706,13 +706,13 @@ func AddSeals(fd *vfs.FileDescription, val uint32) error { if rf.seals&linux.F_SEAL_SEAL != 0 { // Seal applied which prevents addition of any new seals. - return syserror.EPERM + return linuxerr.EPERM } // F_SEAL_WRITE can only be added if there are no active writable maps. if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { if rf.writableMappingPages > 0 { - return syserror.EBUSY + return linuxerr.EBUSY } } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 4393cc13b..cb7711b39 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -21,10 +21,10 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -146,7 +146,7 @@ func TestLocks(t *testing.T) { if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), linuxerr.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { @@ -165,7 +165,7 @@ func TestLocks(t *testing.T) { if err := fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.WriteLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), linuxerr.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockPOSIX(ctx, uid1, lock.LockRange{Start: 0, End: 1}); err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 6b4367c42..feafb06e4 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -43,7 +44,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the default filesystem name. @@ -138,7 +138,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootMode = linux.FileMode(mode & 07777) } @@ -149,12 +149,12 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootKUID = kuid } @@ -165,18 +165,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootKGID = kgid } if len(mopts) != 0 { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } devMinor, err := vfsObj.GetAnonBlockDevMinor() @@ -396,8 +396,8 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth } // Inherit the group and setgid bit as in fs/inode.c:inode_init_owner(). - if parentDir != nil && parentDir.inode.mode&linux.S_ISGID == linux.S_ISGID { - kgid = auth.KGID(parentDir.inode.gid) + if parentDir != nil && atomic.LoadUint32(&parentDir.inode.mode)&linux.S_ISGID == linux.S_ISGID { + kgid = auth.KGID(atomic.LoadUint32(&parentDir.inode.gid)) if mode&linux.S_IFDIR == linux.S_IFDIR { mode |= linux.S_ISGID } @@ -527,7 +527,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs. return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { - return syserror.EPERM + return linuxerr.EPERM } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { @@ -555,9 +555,9 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs. needsCtimeBump = true } case *directory: - return syserror.EISDIR + return linuxerr.EISDIR default: - return syserror.EINVAL + return linuxerr.EINVAL } } if mask&linux.STATX_UID != 0 { @@ -730,7 +730,7 @@ func checkXattrName(name string) error { if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { return nil } - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD index d473a922d..c12abdf33 100644 --- a/pkg/sentry/fsimpl/verity/BUILD +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -1,10 +1,24 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "verity", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + go_library( name = "verity", srcs = [ + "dentry_list.go", "filesystem.go", "save_restore.go", "verity.go", @@ -13,6 +27,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/marshal/primitive", @@ -27,7 +42,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -41,6 +55,7 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/sentry/arch", "//pkg/sentry/fsimpl/testutil", @@ -48,7 +63,6 @@ go_test( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 3582d14c9..52d47994d 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -25,13 +25,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/merkletree" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -66,38 +66,23 @@ func putDentrySlice(ds *[]*dentry) { dentrySlicePool.Put(ds) } -// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls -// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls +// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for // writing. // // ds is a pointer-to-pointer since defer evaluates its arguments immediately, // but dentry slices are allocated lazily, and it's much easier to say "defer -// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { -// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { +// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +// +checklocksrelease:fs.renameMu +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return } - if len(**ds) != 0 { - fs.renameMu.Lock() - for _, d := range **ds { - d.checkDropLocked(ctx) - } - fs.renameMu.Unlock() - } - putDentrySlice(*ds) -} - -func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { - if *ds == nil { - fs.renameMu.Unlock() - return - } for _, d := range **ds { - d.checkDropLocked(ctx) + d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } - fs.renameMu.Unlock() putDentrySlice(*ds) } @@ -113,7 +98,7 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // * !rp.Done(). func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { @@ -163,7 +148,7 @@ afterSymlink: // verifyChildLocked verifies the hash of child against the already verified // hash of the parent to ensure the child is expected. verifyChild triggers a // sentry panic if unexpected modifications to the file system are detected. In -// ErrorOnViolation mode it returns a syserror instead. +// ErrorOnViolation mode it returns a linuxerr instead. // // Preconditions: // * fs.renameMu must be locked. @@ -195,7 +180,7 @@ func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, chi // The Merkle tree file for the child should have been created and // contains the expected xattrs. If the file or the xattr does not // exist, it indicates unexpected modifications to the file system. - if err == syserror.ENOENT || err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENODATA, err) { return nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err)) } if err != nil { @@ -218,7 +203,7 @@ func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, chi // The parent Merkle tree file should have been created. If it's // missing, it indicates an unexpected modification to the file system. - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err)) } if err != nil { @@ -238,7 +223,7 @@ func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, chi // The Merkle tree file for the child should have been created and // contains the expected xattrs. If the file or the xattr does not // exist, it indicates unexpected modifications to the file system. - if err == syserror.ENOENT || err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENODATA, err) { return nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err)) } if err != nil { @@ -261,7 +246,7 @@ func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, chi Root: parent.lowerVD, Start: parent.lowerVD, }, &vfs.StatOptions{}) - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err)) } if err != nil { @@ -282,7 +267,7 @@ func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, chi Mode: uint32(parentStat.Mode), UID: parentStat.UID, GID: parentStat.GID, - Children: parent.childrenNames, + Children: parent.childrenList, HashAlgorithms: fs.alg.toLinuxHashAlg(), ReadOffset: int64(offset), ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())), @@ -327,7 +312,7 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry }, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return fs.alertIntegrityViolation(fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err)) } if err != nil { @@ -341,7 +326,7 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry Size: sizeOfStringInt32, }) - if err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENODATA, err) { return fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err)) } if err != nil { @@ -359,7 +344,7 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry Size: sizeOfStringInt32, }) - if err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENODATA, err) { return fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenOffsetXattr, childPath, err)) } if err != nil { @@ -375,7 +360,7 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry Size: sizeOfStringInt32, }) - if err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENODATA, err) { return fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", childrenSizeXattr, childPath, err)) } if err != nil { @@ -403,6 +388,9 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry var buf bytes.Buffer d.hashMu.RLock() + + d.generateChildrenList() + params := &merkletree.VerifyParams{ Out: &buf, Tree: &fdReader, @@ -411,7 +399,7 @@ func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry Mode: uint32(stat.Mode), UID: stat.UID, GID: stat.GID, - Children: d.childrenNames, + Children: d.childrenList, HashAlgorithms: fs.alg.toLinuxHashAlg(), ReadOffset: 0, // Set read size to 0 so only the metadata is verified. @@ -465,7 +453,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s } childVD, err := parent.getLowerAt(ctx, vfsObj, name) - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { // The file was previously accessed. If the // file does not exist now, it indicates an // unexpected modification to the file system. @@ -480,7 +468,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s // The Merkle tree file was previous accessed. If it // does not exist now, it indicates an unexpected // modification to the file system. - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, fs.alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", path)) } if err != nil { @@ -541,7 +529,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, if parent.verityEnabled() { if _, ok := parent.childrenNames[name]; !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } } @@ -551,7 +539,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, } childVD, err := parent.getLowerAt(ctx, vfsObj, name) - if parent.verityEnabled() && err == syserror.ENOENT { + if parent.verityEnabled() && linuxerr.Equals(linuxerr.ENOENT, err) { return nil, fs.alertIntegrityViolation(fmt.Sprintf("file %s expected but not found", parentPath+"/"+name)) } if err != nil { @@ -564,7 +552,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name) if err != nil { - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { if parent.verityEnabled() { return nil, fs.alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath+"/"+name)) } @@ -589,23 +577,6 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, } } - // Clear the Merkle tree file if they are to be generated at runtime. - // TODO(b/182315468): Optimize the Merkle tree generate process to - // allow only updating certain files/directories. - if fs.allowRuntimeEnable { - childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ - Root: childMerkleVD, - Start: childMerkleVD, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_TRUNC, - Mode: 0644, - }) - if err != nil { - return nil, err - } - childMerkleFD.DecRef(ctx) - } - // The dentry needs to be cleaned up if any error occurs. IncRef will be // called if a verity child dentry is successfully created. defer childMerkleVD.DecRef(ctx) @@ -679,7 +650,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving d = next } if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -699,7 +670,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, d = next } if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return d, nil } @@ -708,11 +679,11 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { // Verity file system is read-only. if ats&vfs.MayWrite != 0 { - return syserror.EROFS + return linuxerr.EROFS } var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -724,14 +695,14 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } if opts.CheckSearchable { if !d.isDir() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err @@ -745,7 +716,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -758,31 +729,31 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Verity fs is read-only. if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 { - return nil, syserror.EROFS + return nil, linuxerr.EROFS } var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if rp.Done() { @@ -826,7 +797,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // Users should not open the Merkle tree files. Those are for verity fs // use only. if strings.Contains(d.name, merklePrefix) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } ats := vfs.AccessTypesForOpenFlags(opts) if err := d.checkPermissions(rp.Credentials(), ats); err != nil { @@ -835,7 +806,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // Verity fs is read-only. if ats&vfs.MayWrite != 0 { - return nil, syserror.EROFS + return nil, linuxerr.EROFS } // Get the path to the target file. This is only used to provide path @@ -845,16 +816,23 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } + tmpOpts := *opts + + // Open the lowerFD with O_PATH if a symlink is opened for verity. + if tmpOpts.Flags&linux.O_NOFOLLOW != 0 && d.isSymlink() { + tmpOpts.Flags |= linux.O_PATH + } + // Open the file in the underlying file system. lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: d.lowerVD, Start: d.lowerVD, - }, opts) + }, &tmpOpts) // The file should exist, as we succeeded in finding its dentry. If it's // missing, it indicates an unexpected modification to the file system. if err != nil { - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, d.fs.alertIntegrityViolation(fmt.Sprintf("File %s expected but not found", path)) } return nil, err @@ -877,7 +855,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // dentry. If it's missing, it indicates an unexpected modification to // the file system. if err != nil { - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, d.fs.alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path)) } return nil, err @@ -887,7 +865,6 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // be called if a verity FD is successfully created. defer merkleReader.DecRef(ctx) - lowerFlags := lowerFD.StatusFlags() lowerFDOpts := lowerFD.Options() var merkleWriter *vfs.FileDescription var parentMerkleWriter *vfs.FileDescription @@ -902,7 +879,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf Flags: linux.O_WRONLY | linux.O_APPEND, }) if err != nil { - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, d.fs.alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path)) } return nil, err @@ -919,7 +896,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf Flags: linux.O_WRONLY | linux.O_APPEND, }) if err != nil { - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD) return nil, d.fs.alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath)) } @@ -940,7 +917,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf isDir: d.isDir(), } - if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { return nil, err } lowerFD.IncRef() @@ -958,7 +935,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -969,26 +946,26 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1021,31 +998,31 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil { return nil, err } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1061,7 +1038,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1076,13 +1053,13 @@ func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { // Verity file system is read-only. - return syserror.EROFS + return linuxerr.EROFS } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 969003613..d2526263c 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -23,10 +23,12 @@ // Lock order: // // filesystem.renameMu -// dentry.dirMu -// fileDescription.mu -// filesystem.verityMu -// dentry.hashMu +// dentry.cachingMu +// filesystem.cacheMu +// dentry.dirMu +// fileDescription.mu +// filesystem.verityMu +// dentry.hashMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to @@ -39,12 +41,14 @@ import ( "encoding/json" "fmt" "math" + "sort" "strconv" "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -58,7 +62,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -95,6 +98,9 @@ const ( // sizeOfStringInt32 is the size for a 32 bit integer stored as string in // extended attributes. The maximum value of a 32 bit integer has 10 digits. sizeOfStringInt32 = 10 + + // defaultMaxCachedDentries is the default limit of dentry cache. + defaultMaxCachedDentries = uint64(1000) ) var ( @@ -105,9 +111,10 @@ var ( // Mount option names for verityfs. const ( - moptLowerPath = "lower_path" - moptRootHash = "root_hash" - moptRootName = "root_name" + moptLowerPath = "lower_path" + moptRootHash = "root_hash" + moptRootName = "root_name" + moptDentryCacheLimit = "dentry_cache_limit" ) // HashAlgorithm is a type specifying the algorithm used to hash the file @@ -187,6 +194,17 @@ type filesystem struct { // dentries. renameMu sync.RWMutex `state:"nosave"` + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by cacheMu. + cacheMu sync.Mutex `state:"nosave"` + cachedDentries dentryList + cachedDentriesLen uint64 + + // maxCachedDentries is the maximum size of filesystem.cachedDentries. + maxCachedDentries uint64 + // verityMu synchronizes enabling verity files, protects files or // directories from being enabled by different threads simultaneously. // It also ensures that verity does not access files that are being @@ -197,6 +215,10 @@ type filesystem struct { // is for the whole file system to ensure that no more than one file is // enabled the same time. verityMu sync.RWMutex `state:"nosave"` + + // released is nonzero once filesystem.Release has been called. It is accessed + // with atomic memory operations. + released int32 } // InternalFilesystemOptions may be passed as @@ -237,7 +259,7 @@ func (FilesystemType) Release(ctx context.Context) {} // mode, it returns EIO, otherwise it panic. func (fs *filesystem) alertIntegrityViolation(msg string) error { if fs.action == ErrorOnViolation { - return syserror.EIO + return linuxerr.EIO } panic(msg) } @@ -251,7 +273,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt hash, err := hex.DecodeString(encodedRootHash) if err != nil { ctx.Warningf("verity.FilesystemType.GetFilesystem: Failed to decode root hash: %v", err) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } rootHash = hash } @@ -265,23 +287,33 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt delete(mopts, moptRootName) rootName = root } + maxCachedDentries := defaultMaxCachedDentries + if str, ok := mopts[moptDentryCacheLimit]; ok { + delete(mopts, moptDentryCacheLimit) + maxCD, err := strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("verity.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str) + return nil, nil, linuxerr.EINVAL + } + maxCachedDentries = maxCD + } // Check for unparsed options. if len(mopts) != 0 { ctx.Warningf("verity.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } // Handle internal options. iopts, ok := opts.InternalData.(InternalFilesystemOptions) if len(lowerPathname) == 0 && !ok { ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if len(lowerPathname) != 0 { if ok { ctx.Warningf("verity.FilesystemType.GetFilesystem: unexpected verity configs with specified lower path") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } iopts = InternalFilesystemOptions{ AllowRuntimeEnable: len(rootHash) == 0, @@ -300,7 +332,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { ctx.Infof("verity.FilesystemType.GetFilesystem: lower_path %q must be absolute", lowerPathname) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } var err error mountedLowerVD, err = vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -338,12 +370,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt action: iopts.Action, opts: opts.Data, allowRuntimeEnable: iopts.AllowRuntimeEnable, + maxCachedDentries: maxCachedDentries, } fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. d := fs.newDentry() - d.refs = 1 + // Set the root's reference count to 2. One reference is returned to + // the caller, and the other is held by fs to prevent the root from + // being "cached" and subsequently evicted. + d.refs = 2 lowerVD := vfs.MakeVirtualDentry(lowerMount, lowerMount.Root()) lowerVD.IncRef() d.lowerVD = lowerVD @@ -358,7 +394,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // If runtime enable is allowed, the root merkle tree may be absent. We // should create the tree file. - if err == syserror.ENOENT && fs.allowRuntimeEnable { + if linuxerr.Equals(linuxerr.ENOENT, err) && fs.allowRuntimeEnable { lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ Root: lowerVD, Start: lowerVD, @@ -439,7 +475,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if !d.isDir() { ctx.Warningf("verity root must be a directory") - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } if !fs.allowRuntimeEnable { @@ -451,7 +487,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt Name: childrenOffsetXattr, Size: sizeOfStringInt32, }) - if err == syserror.ENOENT || err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENODATA, err) { return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenOffsetXattr, err)) } if err != nil { @@ -470,7 +506,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt Name: childrenSizeXattr, Size: sizeOfStringInt32, }) - if err == syserror.ENOENT || err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENODATA, err) { return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenSizeXattr, err)) } if err != nil { @@ -487,7 +523,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt }, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) - if err == syserror.ENOENT { + if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to open root Merkle file: %v", err)) } if err != nil { @@ -508,6 +544,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return nil, nil, err } + d.generateChildrenList() } d.vfsd.Init(d) @@ -517,7 +554,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { + atomic.StoreInt32(&fs.released, 1) fs.lowerMount.DecRef(ctx) + + fs.renameMu.Lock() + fs.evictAllCachedDentriesLocked(ctx) + fs.renameMu.Unlock() + + // An extra reference was held by the filesystem on the root to prevent + // it from being cached/evicted. + fs.rootDentry.DecRef(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. @@ -531,6 +577,11 @@ func (fs *filesystem) MountOptions() string { type dentry struct { vfsd vfs.Dentry + // refs is the reference count. Each dentry holds a reference on its + // parent, even if disowned. When refs reaches 0, the dentry may be + // added to the cache or destroyed. If refs == -1, the dentry has + // already been destroyed. refs is accessed using atomic memory + // operations. refs int64 // fs is the owning filesystem. fs is immutable. @@ -564,6 +615,11 @@ type dentry struct { // populated by enableVerity. childrenNames is also protected by dirMu. childrenNames map[string]struct{} + // childrenList is a complete sorted list of childrenNames. This list + // is generated when verity is enabled, or the first time the file is + // verified in non runtime enable mode. + childrenList []string + // lowerVD is the VirtualDentry in the underlying file system. It is // never modified after initialized. lowerVD vfs.VirtualDentry @@ -580,13 +636,23 @@ type dentry struct { // is protected by hashMu. hashMu sync.RWMutex `state:"nosave"` hash []byte + + // cachingMu is used to synchronize concurrent dentry caching attempts on + // this dentry. + cachingMu sync.Mutex `state:"nosave"` + + // If cached is true, dentryEntry links dentry into + // filesystem.cachedDentries. cached and dentryEntry are protected by + // cachingMu. + cached bool + dentryEntry } // newDentry creates a new dentry representing the given verity file. The -// dentry initially has no references; it is the caller's responsibility to set -// the dentry's reference count and/or call dentry.destroy() as appropriate. -// The dentry is initially invalid in that it contains no underlying dentry; -// the caller is responsible for setting them. +// dentry initially has no references, but is not cached; it is the caller's +// responsibility to set the dentry's reference count and/or call +// dentry.destroy() as appropriate. The dentry is initially invalid in that it +// contains no underlying dentry; the caller is responsible for setting them. func (fs *filesystem) newDentry() *dentry { d := &dentry{ fs: fs, @@ -622,42 +688,23 @@ func (d *dentry) TryIncRef() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { - r := atomic.AddInt64(&d.refs, -1) - if d.LogRefs() { - refsvfs2.LogDecRef(d, r) - } - if r == 0 { - d.fs.renameMu.Lock() - d.checkDropLocked(ctx) - d.fs.renameMu.Unlock() - } else if r < 0 { - panic("verity.dentry.DecRef() called without holding a reference") + if d.decRefNoCaching() == 0 { + d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } } -func (d *dentry) decRefLocked(ctx context.Context) { +// decRefNoCaching decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. +func (d *dentry) decRefNoCaching() int64 { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } - if r == 0 { - d.checkDropLocked(ctx) - } else if r < 0 { - panic("verity.dentry.decRefLocked() called without holding a reference") - } -} - -// checkDropLocked should be called after d's reference count becomes 0 or it -// becomes deleted. -func (d *dentry) checkDropLocked(ctx context.Context) { - // Dentries with a positive reference count must be retained. Dentries - // with a negative reference count have already been destroyed. - if atomic.LoadInt64(&d.refs) != 0 { - return + if r < 0 { + panic("verity.dentry.decRefNoCaching() called without holding a reference") } - // Refs is still zero; destroy it. - d.destroyLocked(ctx) - return + return r } // destroyLocked destroys the dentry. @@ -676,6 +723,12 @@ func (d *dentry) destroyLocked(ctx context.Context) { panic("verity.dentry.destroyLocked() called with references on the dentry") } + // Drop the reference held by d on its parent without recursively + // locking d.fs.renameMu. + if d.parent != nil && d.parent.decRefNoCaching() == 0 { + d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) + } + if d.lowerVD.Ok() { d.lowerVD.DecRef(ctx) } @@ -688,7 +741,6 @@ func (d *dentry) destroyLocked(ctx context.Context) { delete(d.parent.children, d.name) } d.parent.dirMu.Unlock() - d.parent.decRefLocked(ctx) } refsvfs2.Unregister(d) } @@ -727,6 +779,140 @@ func (d *dentry) OnZeroWatches(context.Context) { //TODO(b/159261227): Implement OnZeroWatches. } +// checkCachingLocked should be called after d's reference count becomes 0 or +// it becomes disowned. +// +// For performance, checkCachingLocked can also be called after d's reference +// count becomes non-zero, so that d can be removed from the LRU cache. This +// may help in reducing the size of the cache and hence reduce evictions. Note +// that this is not necessary for correctness. +// +// It may be called on a destroyed dentry. For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// +// Preconditions: d.fs.renameMu must be locked for writing if +// renameMuWriteLocked is true; it may be temporarily unlocked. +func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { + d.cachingMu.Lock() + refs := atomic.LoadInt64(&d.refs) + if refs == -1 { + // Dentry has already been destroyed. + d.cachingMu.Unlock() + return + } + if refs > 0 { + // fs.cachedDentries is permitted to contain dentries with non-zero refs, + // which are skipped by fs.evictCachedDentryLocked() upon reaching the end + // of the LRU. But it is still beneficial to remove d from the cache as we + // are already holding d.cachingMu. Keeping a cleaner cache also reduces + // the number of evictions (which is expensive as it acquires fs.renameMu). + d.removeFromCacheLocked() + d.cachingMu.Unlock() + return + } + + if atomic.LoadInt32(&d.fs.released) != 0 { + d.cachingMu.Unlock() + if !renameMuWriteLocked { + // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as + // needed by d.destroyLocked() later. + d.fs.renameMu.Lock() + defer d.fs.renameMu.Unlock() + } + if d.parent != nil { + d.parent.dirMu.Lock() + delete(d.parent.children, d.name) + d.parent.dirMu.Unlock() + } + d.destroyLocked(ctx) // +checklocksforce: see above. + return + } + + d.fs.cacheMu.Lock() + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + d.fs.cacheMu.Unlock() + d.cachingMu.Unlock() + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + shouldEvict := d.fs.cachedDentriesLen > d.fs.maxCachedDentries + d.fs.cacheMu.Unlock() + d.cachingMu.Unlock() + + if shouldEvict { + if !renameMuWriteLocked { + // Need to lock d.fs.renameMu for writing as needed by + // d.evictCachedDentryLocked(). + d.fs.renameMu.Lock() + defer d.fs.renameMu.Unlock() + } + d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. + } +} + +// Preconditions: d.cachingMu must be locked. +func (d *dentry) removeFromCacheLocked() { + if d.cached { + d.fs.cacheMu.Lock() + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.fs.cacheMu.Unlock() + d.cached = false + } +} + +// Precondition: fs.renameMu must be locked for writing; it may be temporarily +// unlocked. +// +checklocks:fs.renameMu +func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { + for fs.cachedDentriesLen != 0 { + fs.evictCachedDentryLocked(ctx) + } +} + +// Preconditions: +// * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// +checklocks:fs.renameMu +func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { + fs.cacheMu.Lock() + victim := fs.cachedDentries.Back() + fs.cacheMu.Unlock() + if victim == nil { + // fs.cachedDentries may have become empty between when it was + // checked and when we locked fs.cacheMu. + return + } + + victim.cachingMu.Lock() + victim.removeFromCacheLocked() + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. + if atomic.LoadInt64(&victim.refs) != 0 { + victim.cachingMu.Unlock() + return + } + if victim.parent != nil { + victim.parent.dirMu.Lock() + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) + delete(victim.parent.children, victim.name) + victim.parent.dirMu.Unlock() + } + victim.cachingMu.Unlock() + victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. +} + func (d *dentry) isSymlink() bool { return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK } @@ -749,6 +935,17 @@ func (d *dentry) verityEnabled() bool { return !d.fs.allowRuntimeEnable || len(d.hash) != 0 } +// generateChildrenList generates a sorted childrenList from childrenNames, and +// cache it in d for hashing. +func (d *dentry) generateChildrenList() { + if len(d.childrenList) == 0 && len(d.childrenNames) != 0 { + for child := range d.childrenNames { + d.childrenList = append(d.childrenList, child) + } + sort.Strings(d.childrenList) + } +} + // getLowerAt returns the dentry in the underlying file system, which is // represented by filename relative to d. func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) { @@ -857,13 +1054,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { // Verity files are read-only. - return syserror.EPERM + return linuxerr.EPERM } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *fileDescription) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { if !fd.d.isDir() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } fd.mu.Lock() defer fd.mu.Unlock() @@ -921,14 +1118,14 @@ func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) case linux.SEEK_END: n = int64(fd.d.size) default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset > math.MaxInt64-n { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } offset += n if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } fd.off = offset return offset, nil @@ -962,10 +1159,12 @@ func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, ui return nil, 0, err } + fd.d.generateChildrenList() + params := &merkletree.GenerateParams{ TreeReader: &merkleReader, TreeWriter: &merkleWriter, - Children: fd.d.childrenNames, + Children: fd.d.childrenList, HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), Name: fd.d.name, Mode: uint32(stat.Mode), @@ -1007,7 +1206,7 @@ func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, ui default: // TODO(b/167728857): Investigate whether and how we should // enable other types of file. - return nil, 0, syserror.EINVAL + return nil, 0, linuxerr.EINVAL } hash, err := merkletree.Generate(params) return hash, uint64(params.Size), err @@ -1056,7 +1255,7 @@ func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error { // and stores its hash in its parent directory's Merkle tree. func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { if !fd.d.fs.allowRuntimeEnable { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } fd.d.fs.verityMu.Lock() @@ -1070,6 +1269,21 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { return 0, fd.d.fs.alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") } + // Populate children names here. We cannot rely on the children + // dentries to populate parent dentry's children names, because the + // parent dentry may be destroyed before users enable verity if its ref + // count drops to zero. + if fd.d.isDir() { + if err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name != "." && dirent.Name != ".." { + fd.d.childrenNames[dirent.Name] = struct{}{} + } + return nil + })); err != nil { + return 0, err + } + } + hash, dataSize, err := fd.generateMerkleLocked(ctx) if err != nil { return 0, err @@ -1097,9 +1311,6 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { }); err != nil { return 0, err } - - // Add the current child's name to parent's childrenNames. - fd.d.parent.childrenNames[fd.d.name] = struct{}{} } // Record the size of the data being hashed for fd. @@ -1125,7 +1336,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hostarch.Addr) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var metadata linux.DigestMetadata @@ -1138,7 +1349,7 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hosta // enabled, in which case fd.d.hash should be set. if len(fd.d.hash) == 0 { if fd.d.fs.allowRuntimeEnable { - return 0, syserror.ENODATA + return 0, linuxerr.ENODATA } return 0, fd.d.fs.alertIntegrityViolation("Ioctl measureVerity: no hash found") } @@ -1148,7 +1359,7 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hosta return 0, err } if metadata.DigestSize < uint16(len(fd.d.hash)) { - return 0, syserror.EOVERFLOW + return 0, linuxerr.EOVERFLOW } // Populate the output digest size, since DigestSize is both input and @@ -1178,7 +1389,7 @@ func (fd *fileDescription) verityFlags(ctx context.Context, flags hostarch.Addr) t := kernel.TaskFromContext(ctx) if t == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } _, err := primitive.CopyInt32Out(t, flags, f) return 0, err @@ -1194,7 +1405,7 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. case linux.FS_IOC_GETFLAGS: return fd.verityFlags(ctx, args[2].Pointer()) default: - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } } @@ -1227,7 +1438,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of // The Merkle tree file for the child should have been created and // contains the expected xattrs. If the xattr does not exist, it // indicates unexpected modifications to the file system. - if err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENODATA, err) { return 0, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err)) } if err != nil { @@ -1261,7 +1472,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Mode: fd.d.mode, UID: fd.d.uid, GID: fd.d.gid, - Children: fd.d.childrenNames, + Children: fd.d.childrenList, HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(), ReadOffset: offset, ReadSize: dst.NumBytes(), @@ -1277,12 +1488,12 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EROFS + return 0, linuxerr.EROFS } // Write implements vfs.FileDescriptionImpl.Write. func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EROFS + return 0, linuxerr.EROFS } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. @@ -1298,7 +1509,7 @@ func (fd *fileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapO // Check if mmap is allowed on the lower filesystem. if !opts.SentryOwnedContent { - return syserror.ENODEV + return linuxerr.ENODEV } return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } @@ -1349,7 +1560,7 @@ func (fd *fileDescription) Translate(ctx context.Context, required, optional mem // The Merkle tree file for the child should have been created and // contains the expected xattrs. If the xattr does not exist, it // indicates unexpected modifications to the file system. - if err == syserror.ENODATA { + if linuxerr.Equals(linuxerr.ENODATA, err) { return nil, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err)) } if err != nil { @@ -1433,7 +1644,7 @@ func (r *mmapReadSeeker) ReadAt(p []byte, off int64) (int, error) { // mapped region. readOffset := off - int64(r.Offset) if readOffset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } bs.DropFirst64(uint64(readOffset)) view := bs.TakeFirst64(uint64(len(p))) diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go index 5c78a0019..af041bd50 100644 --- a/pkg/sentry/fsimpl/verity/verity_test.go +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" @@ -31,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -476,7 +476,7 @@ func TestOpenNonexistentFile(t *testing.T) { // Ensure open an unexpected file in the parent directory fails with // ENOENT rather than verification failure. - if _, err = openVerityAt(ctx, vfsObj, root, filename+"abc", linux.O_RDONLY, linux.ModeRegular); err != syserror.ENOENT { + if _, err = openVerityAt(ctx, vfsObj, root, filename+"abc", linux.O_RDONLY, linux.ModeRegular); !linuxerr.Equals(linuxerr.ENOENT, err) { t.Errorf("OpenAt unexpected error: %v", err) } } @@ -767,7 +767,7 @@ func TestOpenDeletedFileFails(t *testing.T) { } // Ensure reopening the verity enabled file fails. - if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO { + if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); !linuxerr.Equals(linuxerr.EIO, err) { t.Errorf("got OpenAt error: %v, expected EIO", err) } }) @@ -829,7 +829,7 @@ func TestOpenRenamedFileFails(t *testing.T) { } // Ensure reopening the verity enabled file fails. - if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO { + if _, err = openVerityAt(ctx, vfsObj, root, filename, linux.O_RDONLY, linux.ModeRegular); !linuxerr.Equals(linuxerr.EIO, err) { t.Errorf("got OpenAt error: %v, expected EIO", err) } }) @@ -899,7 +899,7 @@ func TestUnmodifiedSymlinkFileReadSucceeds(t *testing.T) { t.Fatalf("SymlinkAt: %v", err) } - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("openVerityAt symlink: %v", err) @@ -1034,7 +1034,7 @@ func TestDeletedSymlinkFileReadFails(t *testing.T) { t.Fatalf("SymlinkAt: %v", err) } - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("openVerityAt symlink: %v", err) @@ -1063,14 +1063,14 @@ func TestDeletedSymlinkFileReadFails(t *testing.T) { Root: root, Start: root, Path: fspath.Parse(symlink), - }); err != syserror.EIO { + }); !linuxerr.Equals(linuxerr.EIO, err) { t.Fatalf("ReadlinkAt succeeded with modified symlink: %v", err) } if tc.testWalk { fileInSymlinkDirectory := symlink + "/verity-test-file" // Ensure opening the verity enabled file in the symlink directory fails. - if _, err := openVerityAt(ctx, vfsObj, root, fileInSymlinkDirectory, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO { + if _, err := openVerityAt(ctx, vfsObj, root, fileInSymlinkDirectory, linux.O_RDONLY, linux.ModeRegular); !linuxerr.Equals(linuxerr.EIO, err) { t.Errorf("Open succeeded with modified symlink: %v", err) } } @@ -1136,7 +1136,7 @@ func TestModifiedSymlinkFileReadFails(t *testing.T) { } // Open symlink file to get the fd for ioctl in new step. - fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_PATH|linux.O_NOFOLLOW, linux.ModeRegular) + fd, err := openVerityAt(ctx, vfsObj, root, symlink, linux.O_NOFOLLOW, linux.ModeRegular) if err != nil { t.Fatalf("OpenAt symlink: %v", err) } @@ -1195,14 +1195,14 @@ func TestModifiedSymlinkFileReadFails(t *testing.T) { Root: root, Start: root, Path: fspath.Parse(symlink), - }); err != syserror.EIO { + }); !linuxerr.Equals(linuxerr.EIO, err) { t.Fatalf("ReadlinkAt succeeded with modified symlink: %v", err) } if tc.testWalk { fileInSymlinkDirectory := symlink + "/verity-test-file" // Ensure opening the verity enabled file in the symlink directory fails. - if _, err := openVerityAt(ctx, vfsObj, root, fileInSymlinkDirectory, linux.O_RDONLY, linux.ModeRegular); err != syserror.EIO { + if _, err := openVerityAt(ctx, vfsObj, root, fileInSymlinkDirectory, linux.O_RDONLY, linux.ModeRegular); !linuxerr.Equals(linuxerr.EIO, err) { t.Errorf("Open succeeded with modified symlink: %v", err) } } diff --git a/pkg/sentry/hostfd/hostfd_linux.go b/pkg/sentry/hostfd/hostfd_linux.go index 1cabc848f..0131da22d 100644 --- a/pkg/sentry/hostfd/hostfd_linux.go +++ b/pkg/sentry/hostfd/hostfd_linux.go @@ -12,7 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package hostfd -// maxIov is the maximum permitted size of a struct iovec array. -const maxIov = 1024 // UIO_MAXIOV +// MaxReadWriteIov is the maximum permitted size of a struct iovec array in a +// readv, writev, preadv, or pwritev host syscall. +const MaxReadWriteIov = 1024 // UIO_MAXIOV + +// MaxSendRecvMsgIov is the maximum permitted size of a struct iovec array in a +// sendmsg or recvmsg host syscall. +const MaxSendRecvMsgIov = 1024 // UIO_MAXIOV diff --git a/pkg/sentry/hostfd/hostfd_unsafe.go b/pkg/sentry/hostfd/hostfd_unsafe.go index 03c6d2a16..a43311eb4 100644 --- a/pkg/sentry/hostfd/hostfd_unsafe.go +++ b/pkg/sentry/hostfd/hostfd_unsafe.go @@ -23,6 +23,11 @@ import ( "gvisor.dev/gvisor/pkg/safemem" ) +const ( + sizeofIovec = unsafe.Sizeof(unix.Iovec{}) + sizeofMsghdr = unsafe.Sizeof(unix.Msghdr{}) +) + // Preadv2 reads up to dsts.NumBytes() bytes from host file descriptor fd into // dsts. offset and flags are interpreted as for preadv2(2). // @@ -44,9 +49,9 @@ func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint6 } } else { iovs := safemem.IovecsFromBlockSeq(dsts) - if len(iovs) > maxIov { - log.Debugf("hostfd.Preadv2: truncating from %d iovecs to %d", len(iovs), maxIov) - iovs = iovs[:maxIov] + if len(iovs) > MaxReadWriteIov { + log.Debugf("hostfd.Preadv2: truncating from %d iovecs to %d", len(iovs), MaxReadWriteIov) + iovs = iovs[:MaxReadWriteIov] } n, _, e = unix.Syscall6(unix.SYS_PREADV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags)) } @@ -80,9 +85,9 @@ func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint } } else { iovs := safemem.IovecsFromBlockSeq(srcs) - if len(iovs) > maxIov { - log.Debugf("hostfd.Preadv2: truncating from %d iovecs to %d", len(iovs), maxIov) - iovs = iovs[:maxIov] + if len(iovs) > MaxReadWriteIov { + log.Debugf("hostfd.Preadv2: truncating from %d iovecs to %d", len(iovs), MaxReadWriteIov) + iovs = iovs[:MaxReadWriteIov] } n, _, e = unix.Syscall6(unix.SYS_PWRITEV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags)) } diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 80dda1559..b121fc1b4 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -27,6 +27,9 @@ type Stack interface { // integers. Interfaces() map[int32]Interface + // RemoveInterface removes the specified network interface. + RemoveInterface(idx int32) error + // InterfaceAddrs returns all network interface addresses as a mapping from // interface indexes to a slice of associated interface address properties. InterfaceAddrs() map[int32][]InterfaceAddr diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index 218d9dafc..621f47e1f 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -45,23 +45,29 @@ func NewTestStack() *TestStack { } } -// Interfaces implements Stack.Interfaces. +// Interfaces implements Stack. func (s *TestStack) Interfaces() map[int32]Interface { return s.InterfacesMap } -// InterfaceAddrs implements Stack.InterfaceAddrs. +// RemoveInterface implements Stack. +func (s *TestStack) RemoveInterface(idx int32) error { + delete(s.InterfacesMap, idx) + return nil +} + +// InterfaceAddrs implements Stack. func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr { return s.InterfaceAddrsMap } -// AddInterfaceAddr implements Stack.AddInterfaceAddr. +// AddInterfaceAddr implements Stack. func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error { s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr) return nil } -// RemoveInterfaceAddr implements Stack.RemoveInterfaceAddr. +// RemoveInterfaceAddr implements Stack. func (s *TestStack) RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error { interfaceAddrs, ok := s.InterfaceAddrsMap[idx] if !ok { @@ -79,94 +85,94 @@ func (s *TestStack) RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error { return nil } -// SupportsIPv6 implements Stack.SupportsIPv6. +// SupportsIPv6 implements Stack. func (s *TestStack) SupportsIPv6() bool { return s.SupportsIPv6Flag } -// TCPReceiveBufferSize implements Stack.TCPReceiveBufferSize. +// TCPReceiveBufferSize implements Stack. func (s *TestStack) TCPReceiveBufferSize() (TCPBufferSize, error) { return s.TCPRecvBufSize, nil } -// SetTCPReceiveBufferSize implements Stack.SetTCPReceiveBufferSize. +// SetTCPReceiveBufferSize implements Stack. func (s *TestStack) SetTCPReceiveBufferSize(size TCPBufferSize) error { s.TCPRecvBufSize = size return nil } -// TCPSendBufferSize implements Stack.TCPSendBufferSize. +// TCPSendBufferSize implements Stack. func (s *TestStack) TCPSendBufferSize() (TCPBufferSize, error) { return s.TCPSendBufSize, nil } -// SetTCPSendBufferSize implements Stack.SetTCPSendBufferSize. +// SetTCPSendBufferSize implements Stack. func (s *TestStack) SetTCPSendBufferSize(size TCPBufferSize) error { s.TCPSendBufSize = size return nil } -// TCPSACKEnabled implements Stack.TCPSACKEnabled. +// TCPSACKEnabled implements Stack. func (s *TestStack) TCPSACKEnabled() (bool, error) { return s.TCPSACKFlag, nil } -// SetTCPSACKEnabled implements Stack.SetTCPSACKEnabled. +// SetTCPSACKEnabled implements Stack. func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { s.TCPSACKFlag = enabled return nil } -// TCPRecovery implements Stack.TCPRecovery. +// TCPRecovery implements Stack. func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) { return s.Recovery, nil } -// SetTCPRecovery implements Stack.SetTCPRecovery. +// SetTCPRecovery implements Stack. func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error { s.Recovery = recovery return nil } -// Statistics implements inet.Stack.Statistics. +// Statistics implements Stack. func (s *TestStack) Statistics(stat interface{}, arg string) error { return nil } -// RouteTable implements Stack.RouteTable. +// RouteTable implements Stack. func (s *TestStack) RouteTable() []Route { return s.RouteList } -// Resume implements Stack.Resume. +// Resume implements Stack. func (s *TestStack) Resume() {} -// RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. +// RegisteredEndpoints implements Stack. func (s *TestStack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } -// CleanupEndpoints implements inet.Stack.CleanupEndpoints. +// CleanupEndpoints implements Stack. func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint { return nil } -// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. +// RestoreCleanupEndpoints implements Stack. func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} -// SetForwarding implements inet.Stack.SetForwarding. +// SetForwarding implements Stack. func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { s.IPForwarding = enable return nil } -// PortRange implements inet.Stack.PortRange. +// PortRange implements Stack. func (*TestStack) PortRange() (uint16, uint16) { // Use the default Linux values per net/ipv4/af_inet.c:inet_init_net(). return 32768, 28232 } -// SetPortRange implements inet.Stack.SetPortRange. +// SetPortRange implements Stack. func (*TestStack) SetPortRange(start uint16, end uint16) error { // No-op. return nil diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 188c0ebff..c0f13bf52 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -218,13 +218,17 @@ go_library( ":uncaught_signal_go_proto", "//pkg/abi", "//pkg/abi/linux", + "//pkg/abi/linux/errno", "//pkg/amutex", + "//pkg/bitmap", "//pkg/bits", "//pkg/bpf", "//pkg/cleanup", "//pkg/context", "//pkg/coverage", "//pkg/cpuid", + "//pkg/errors", + "//pkg/errors/linuxerr", "//pkg/eventchannel", "//pkg/fspath", "//pkg/goid", @@ -253,6 +257,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/msgqueue", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", @@ -263,6 +268,7 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/seccheck", "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/time", @@ -276,7 +282,6 @@ go_library( "//pkg/state/wire", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/usermem", @@ -298,6 +303,7 @@ go_test( deps = [ "//pkg/abi", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/arch", "//pkg/sentry/contexttest", @@ -309,6 +315,5 @@ go_test( "//pkg/sentry/time", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index d100e58d7..5d86a04f3 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -27,7 +27,7 @@ import ( // +stateify savable type abstractEndpoint struct { ep transport.BoundEndpoint - socket refsvfs2.RefCounter + socket refsvfs2.TryRefCounter name string ns *AbstractSocketNamespace } @@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { // its backing socket. type boundEndpoint struct { transport.BoundEndpoint - socket refsvfs2.RefCounter + socket refsvfs2.TryRefCounter } // Release implements transport.BoundEndpoint.Release. @@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp // // When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.TryRefCounter) error { a.mu.Lock() defer a.mu.Unlock() @@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran // Remove removes the specified socket at name from the abstract socket // namespace, if it has not yet been replaced. -func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) { +func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.TryRefCounter) { a.mu.Lock() defer a.mu.Unlock() diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 12180351d..9aa03f506 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -63,8 +63,8 @@ go_library( "//pkg/abi/linux", "//pkg/bits", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/sync", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index 3325fedcb..fc245c54b 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -16,7 +16,7 @@ package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // Credentials contains information required to authorize privileged operations @@ -203,7 +203,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) { // uid must be mapped. kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { - return NoID, syserror.EINVAL + return NoID, linuxerr.EINVAL } // If c has CAP_SETUID, then it can use any UID in its user namespace. if c.HasCapability(linux.CAP_SETUID) { @@ -214,7 +214,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) { if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { return kuid, nil } - return NoID, syserror.EPERM + return NoID, linuxerr.EPERM } // UseGID checks that c can use gid in its user namespace, then translates it @@ -222,7 +222,7 @@ func (c *Credentials) UseUID(uid UID) (KUID, error) { func (c *Credentials) UseGID(gid GID) (KGID, error) { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return NoID, syserror.EINVAL + return NoID, linuxerr.EINVAL } if c.HasCapability(linux.CAP_SETGID) { return kgid, nil @@ -230,7 +230,7 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { return kgid, nil } - return NoID, syserror.EPERM + return NoID, linuxerr.EPERM } // SetUID translates the provided uid to the root user namespace and updates c's @@ -239,7 +239,7 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { func (c *Credentials) SetUID(uid UID) error { kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } c.RealKUID = kuid c.EffectiveKUID = kuid @@ -253,7 +253,7 @@ func (c *Credentials) SetUID(uid UID) error { func (c *Credentials) SetGID(gid GID) error { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } c.RealKGID = kgid c.EffectiveKGID = kgid diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 28cbe159d..f06a374a0 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -17,7 +17,7 @@ package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. @@ -106,11 +106,11 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er // than once to a uid_map file in a user namespace fails with the error // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) if !ns.uidMapFromParent.IsEmpty() { - return syserror.EPERM + return linuxerr.EPERM } // "At least one line must be written to the file." if len(entries) == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // """ // In order for a process to write to the /proc/[pid]/uid_map @@ -121,12 +121,12 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er // in the user namespace of the process pid. // """ if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { - return syserror.EPERM + return linuxerr.EPERM } // "2. The writing process must either be in the user namespace of the process // pid or be in the parent user namespace of the process pid." if c.UserNamespace != ns && c.UserNamespace != ns.parent { - return syserror.EPERM + return linuxerr.EPERM } // """ // 3. (see trySetUIDMap) @@ -145,14 +145,14 @@ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) er // parent user namespace to a user ID (group ID) in the user namespace. // """ if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { - return syserror.EPERM + return linuxerr.EPERM } // """ // + The writing process must have the same effective user ID as the // process that created the user namespace. // """ if c.EffectiveKUID != ns.owner { - return syserror.EPERM + return linuxerr.EPERM } } // trySetUIDMap leaves data in maps if it fails. @@ -170,11 +170,11 @@ func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { // checks for NoID. lastID := e.FirstID + e.Length if lastID <= e.FirstID { - return syserror.EINVAL + return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { - return syserror.EINVAL + return linuxerr.EINVAL } // "3. The mapped user IDs (group IDs) must in turn have a mapping in // the parent user namespace." @@ -182,14 +182,14 @@ func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { // mappings when it's created, so SetUIDMap would have returned EPERM // without reaching this point if ns is root. if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { - return syserror.EPERM + return linuxerr.EPERM } // If either of these Adds fail, we have an overlapping range. if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { - return syserror.EINVAL + return linuxerr.EINVAL } if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { - return syserror.EINVAL + return linuxerr.EINVAL } } return nil @@ -202,24 +202,24 @@ func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) er ns.mu.Lock() defer ns.mu.Unlock() if !ns.gidMapFromParent.IsEmpty() { - return syserror.EPERM + return linuxerr.EPERM } if len(entries) == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { - return syserror.EPERM + return linuxerr.EPERM } if c.UserNamespace != ns && c.UserNamespace != ns.parent { - return syserror.EPERM + return linuxerr.EPERM } if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { - return syserror.EPERM + return linuxerr.EPERM } // It's correct for this to still be UID. if c.EffectiveKUID != ns.owner { - return syserror.EPERM + return linuxerr.EPERM } // "In the case of gid_map, use of the setgroups(2) system call must // first be denied by writing "deny" to the /proc/[pid]/setgroups file @@ -239,20 +239,20 @@ func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { for _, e := range entries { lastID := e.FirstID + e.Length if lastID <= e.FirstID { - return syserror.EINVAL + return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { - return syserror.EINVAL + return linuxerr.EINVAL } if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { - return syserror.EPERM + return linuxerr.EPERM } if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { - return syserror.EINVAL + return linuxerr.EINVAL } if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { - return syserror.EINVAL + return linuxerr.EINVAL } } return nil diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 9dd52c860..40a406f9d 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -17,8 +17,8 @@ package auth import ( "math" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // A UserNamespace represents a user namespace. See user_namespaces(7) for @@ -105,7 +105,7 @@ func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { if c.UserNamespace.depth() >= maxUserNamespaceDepth { // "... Calls to unshare(2) or clone(2) that would cause this limit to // be exceeded fail with the error EUSERS." - user_namespaces(7) - return nil, syserror.EUSERS + return nil, linuxerr.EUSERS } // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective // user ID or the effective group ID of the caller does not have a mapping @@ -114,10 +114,10 @@ func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { // process are mapped to user IDs and group IDs in the user namespace of // the calling process at the time of the call." - unshare(2) if !c.EffectiveKUID.In(c.UserNamespace).Ok() { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } if !c.EffectiveKGID.In(c.UserNamespace).Ok() { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } return &UserNamespace{ parent: c.UserNamespace, diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go index c93ef6ac1..a0e291f58 100644 --- a/pkg/sentry/kernel/cgroup.go +++ b/pkg/sentry/kernel/cgroup.go @@ -196,6 +196,7 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files // uniqueness of controllers enforced by Register, drop the // dying hierarchy now. The eventual unregister by the FS // teardown will become a no-op. + r.unregisterLocked(h.id) return nil } return h.fs diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 564c3d42e..f240a68aa 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -9,13 +9,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fdnotifier", "//pkg/hostarch", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 4466fbc9d..5ea44a2c2 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -22,13 +22,13 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -145,7 +145,7 @@ func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) if _, err := unix.Read(e.hostfd, buf[:]); err != nil { if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -165,7 +165,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro // We can't complete the read if the value is currently zero. if e.val == 0 { e.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } // Update the value based on the mode the event is operating in. @@ -198,7 +198,7 @@ func (e *EventOperations) hostWrite(val uint64) error { hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(e.hostfd, buf[:]) if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -230,7 +230,7 @@ func (e *EventOperations) Signal(val uint64) error { // uint64 minus 1. if val > math.MaxUint64-1-e.val { e.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } e.val += val diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 6224a0cbd..6b2dd09da 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -8,12 +8,12 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/errors/linuxerr", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 5d584dc45..473987a79 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -17,12 +17,12 @@ package fasync import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -248,7 +248,7 @@ func (a *FileAsync) Signal() linux.Signal { // to send SIGIO. func (a *FileAsync) SetSignal(signal linux.Signal) error { if signal != 0 && !signal.IsValid() { - return syserror.EINVAL + return linuxerr.EINVAL } a.mu.Lock() defer a.mu.Unlock() diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 62777faa8..eff556a0c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -18,17 +18,17 @@ import ( "fmt" "math" "strings" - "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // FDFlags define flags for an individual descriptor. @@ -84,13 +84,8 @@ type FDTable struct { // mu protects below. mu sync.Mutex `state:"nosave"` - // next is start position to find fd. - next int32 - - // used contains the number of non-nil entries. It must be accessed - // atomically. It may be read atomically without holding mu (but not - // written). - used int32 + // fdBitmap shows which fds are already in use. + fdBitmap bitmap.Bitmap `state:"nosave"` // descriptorTable holds descriptors. descriptorTable `state:".(map[int32]descriptor)"` @@ -98,6 +93,8 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) + f.mu.Lock() + defer f.mu.Unlock() f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ file: file, @@ -111,12 +108,16 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { ctx := context.Background() f.initNoLeakCheck() // Initialize table. - f.used = 0 + f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) for fd, d := range m { + if fd < 0 { + panic(fmt.Sprintf("FD is not supposed to be negative. FD: %d", fd)) + } + if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil { panic("VFS1 or VFS2 files set") } - + f.fdBitmap.Add(uint32(fd)) // Note that we do _not_ need to acquire a extra table reference here. The // table reference will already be accounted for in the file, so we drop the // reference taken by set above. @@ -156,7 +157,7 @@ func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) { // Release any POSIX lock possibly held by the FDTable. if file.SupportsLocks() { err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) - if err != nil && err != syserror.ENOLCK { + if err != nil && !linuxerr.Equals(linuxerr.ENOLCK, err) { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) } } @@ -189,8 +190,10 @@ func (f *FDTable) DecRef(ctx context.Context) { func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { // retries tracks the number of failed TryIncRef attempts for the same FD. retries := 0 - fd := int32(0) - for { + fds := f.fdBitmap.ToSlice() + // Iterate through the fdBitmap. + for _, ufd := range fds { + fd := int32(ufd) file, fileVFS2, flags, ok := f.getAll(fd) if !ok { break @@ -218,7 +221,6 @@ func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2.DecRef(ctx) } retries = 0 - fd++ } } @@ -226,6 +228,8 @@ func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, func (f *FDTable) String() string { var buf strings.Builder ctx := context.Background() + f.mu.Lock() + defer f.mu.Unlock() f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: @@ -250,10 +254,10 @@ func (f *FDTable) String() string { } // NewFDs allocates new FDs guaranteed to be the lowest number available -// greater than or equal to the fd parameter. All files will share the set +// greater than or equal to the minFD parameter. All files will share the set // flags. Success is guaranteed to be all or none. -func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { - if fd < 0 { +func (f *FDTable) NewFDs(ctx context.Context, minFD int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { + if minFD < 0 { // Don't accept negative FDs. return nil, unix.EINVAL } @@ -267,31 +271,48 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags if lim.Cur != limits.Infinity { end = int32(lim.Cur) } - if fd >= end { + if minFD+int32(len(files)) > end { return nil, unix.EMFILE } } f.mu.Lock() - // From f.next to find available fd. - if fd < f.next { - fd = f.next + // max is used as the largest number in fdBitmap + 1. + max := int32(0) + + if !f.fdBitmap.IsEmpty() { + max = int32(f.fdBitmap.Maximum()) + max++ } + // Adjust max in case it is less than minFD. + if max < minFD { + max = minFD + } // Install all entries. - for i := fd; i < end && len(fds) < len(files); i++ { - if d, _, _ := f.get(i); d == nil { - // Set the descriptor. - f.set(ctx, i, files[len(fds)], flags) - fds = append(fds, i) // Record the file descriptor. + for len(fds) < len(files) { + // Try to use free bit in fdBitmap. + // If all bits in fdBitmap are used, expand fd to the max. + fd := f.fdBitmap.FirstZero(uint32(minFD)) + if fd == math.MaxInt32 { + fd = uint32(max) + max++ + } + if fd >= uint32(end) { + break } + f.fdBitmap.Add(fd) + f.set(ctx, int32(fd), files[len(fds)], flags) + fds = append(fds, int32(fd)) + minFD = int32(fd) } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { f.set(ctx, i, nil, FDFlags{}) + f.fdBitmap.Remove(uint32(i)) } f.mu.Unlock() @@ -305,20 +326,15 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return nil, unix.EMFILE } - if fd == f.next { - // Update next search start position. - f.next = fds[len(fds)-1] + 1 - } - f.mu.Unlock() return fds, nil } // NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available -// greater than or equal to the fd parameter. All files will share the set +// greater than or equal to the minFD parameter. All files will share the set // flags. Success is guaranteed to be all or none. -func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { - if fd < 0 { +func (f *FDTable) NewFDsVFS2(ctx context.Context, minFD int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { + if minFD < 0 { // Don't accept negative FDs. return nil, unix.EINVAL } @@ -332,31 +348,47 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes if lim.Cur != limits.Infinity { end = int32(lim.Cur) } - if fd >= end { + if minFD >= end { return nil, unix.EMFILE } } f.mu.Lock() - // From f.next to find available fd. - if fd < f.next { - fd = f.next + // max is used as the largest number in fdBitmap + 1. + max := int32(0) + + if !f.fdBitmap.IsEmpty() { + max = int32(f.fdBitmap.Maximum()) + max++ } - // Install all entries. - for i := fd; i < end && len(fds) < len(files); i++ { - if d, _, _ := f.getVFS2(i); d == nil { - // Set the descriptor. - f.setVFS2(ctx, i, files[len(fds)], flags) - fds = append(fds, i) // Record the file descriptor. - } + // Adjust max in case it is less than minFD. + if max < minFD { + max = minFD } + for len(fds) < len(files) { + // Try to use free bit in fdBitmap. + // If all bits in fdBitmap are used, expand fd to the max. + fd := f.fdBitmap.FirstZero(uint32(minFD)) + if fd == math.MaxInt32 { + fd = uint32(max) + max++ + } + if fd >= uint32(end) { + break + } + f.fdBitmap.Add(fd) + f.setVFS2(ctx, int32(fd), files[len(fds)], flags) + fds = append(fds, int32(fd)) + minFD = int32(fd) + } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { f.setVFS2(ctx, i, nil, FDFlags{}) + f.fdBitmap.Remove(uint32(i)) } f.mu.Unlock() @@ -370,57 +402,19 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes return nil, unix.EMFILE } - if fd == f.next { - // Update next search start position. - f.next = fds[len(fds)-1] + 1 - } - f.mu.Unlock() return fds, nil } -// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for +// NewFDVFS2 allocates a file descriptor greater than or equal to minFD for // the given file description. If it succeeds, it takes a reference on file. -func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { - if minfd < 0 { - // Don't accept negative FDs. - return -1, unix.EINVAL - } - - // Default limit. - end := int32(math.MaxInt32) - - // Ensure we don't get past the provided limit. - if limitSet := limits.FromContext(ctx); limitSet != nil { - lim := limitSet.Get(limits.NumberOfFiles) - if lim.Cur != limits.Infinity { - end = int32(lim.Cur) - } - if minfd >= end { - return -1, unix.EMFILE - } - } - - f.mu.Lock() - defer f.mu.Unlock() - - // From f.next to find available fd. - fd := minfd - if fd < f.next { - fd = f.next - } - for fd < end { - if d, _, _ := f.getVFS2(fd); d == nil { - f.setVFS2(ctx, fd, file, flags) - if fd == f.next { - // Update next search start position. - f.next = fd + 1 - } - return fd, nil - } - fd++ +func (f *FDTable) NewFDVFS2(ctx context.Context, minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + files := []*vfs.FileDescription{file} + fileSlice, error := f.NewFDsVFS2(ctx, minFD, files, flags) + if error != nil { + return -1, error } - return -1, unix.EMFILE + return fileSlice[0], nil } // NewFDAt sets the file reference for the given FD. If there is an active @@ -469,6 +463,11 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 defer f.mu.Unlock() df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags) + // Add fd to fdBitmap. + if file != nil || fileVFS2 != nil { + f.fdBitmap.Add(uint32(fd)) + } + return df, dfVFS2, nil } @@ -573,7 +572,9 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. func (f *FDTable) GetFDs(ctx context.Context) []int32 { - fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) + f.mu.Lock() + defer f.mu.Unlock() + fds := make([]int32, 0, int(f.fdBitmap.GetNumOnes())) f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) @@ -583,13 +584,15 @@ func (f *FDTable) GetFDs(ctx context.Context) []int32 { // Fork returns an independent FDTable. func (f *FDTable) Fork(ctx context.Context) *FDTable { clone := f.k.NewFDTable() - + f.mu.Lock() + defer f.mu.Unlock() f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil { panic("VFS1 or VFS2 files set") } + clone.fdBitmap.Add(uint32(fd)) }) return clone } @@ -604,11 +607,6 @@ func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDesc f.mu.Lock() - // Update current available position. - if fd < f.next { - f.next = fd - } - orig, orig2, _, _ := f.getAll(fd) // Add reference for caller. @@ -621,6 +619,7 @@ func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDesc if orig != nil || orig2 != nil { orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry. + f.fdBitmap.Remove(uint32(fd)) } f.mu.Unlock() @@ -644,16 +643,13 @@ func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDes f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { if cond(file, fileVFS2, flags) { df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table. + f.fdBitmap.Remove(uint32(fd)) if df != nil { files = append(files, df) } if dfVFS2 != nil { filesVFS2 = append(filesVFS2, dfVFS2) } - // Update current available position. - if fd < f.next { - f.next = fd - } } }) f.mu.Unlock() diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index f17f9c59c..2b3e6ef71 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -15,9 +15,11 @@ package kernel import ( + "math" "sync/atomic" "unsafe" + "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -44,6 +46,7 @@ func (f *FDTable) initNoLeakCheck() { func (f *FDTable) init() { f.initNoLeakCheck() f.InitRefs() + f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) } // get gets a file entry. @@ -162,14 +165,6 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 } } - // Adjust used. - switch { - case orig == nil && desc != nil: - atomic.AddInt32(&f.used, 1) - case orig != nil && desc == nil: - atomic.AddInt32(&f.used, -1) - } - if orig != nil { switch { case orig.file != nil: diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 6c31e082c..c897e3a5f 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -37,11 +37,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/sentry/memmap", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -53,8 +53,8 @@ go_test( library = ":futex", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sync", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 0427cf3f4..2c9ea65aa 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -20,10 +20,10 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // KeyKind indicates the type of a Key. @@ -122,7 +122,7 @@ func check(t Target, addr hostarch.Addr, val uint32) error { return err } if cur != val { - return syserror.EAGAIN + return linuxerr.EAGAIN } return nil } @@ -165,7 +165,7 @@ func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) { case linux.FUTEX_OP_XOR: newVal = oldVal ^ opArg default: - return false, syserror.ENOSYS + return false, linuxerr.ENOSYS } prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) if err != nil { @@ -191,7 +191,7 @@ func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) { case linux.FUTEX_OP_CMP_GE: return oldVal >= cmpArg, nil default: - return false, syserror.ENOSYS + return false, linuxerr.ENOSYS } } @@ -332,7 +332,7 @@ func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. if addr&0x3 != 0 { - return Key{}, syserror.EINVAL + return Key{}, linuxerr.EINVAL } if private { return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil @@ -397,8 +397,8 @@ func (m *Manager) Fork() *Manager { } // lockBucket returns a locked bucket for the given key. -func (m *Manager) lockBucket(k *Key) *bucket { - var b *bucket +// +checklocksacquire:b.mu +func (m *Manager) lockBucket(k *Key) (b *bucket) { if k.Kind == KindSharedMappable { b = m.sharedBucket } else { @@ -409,7 +409,9 @@ func (m *Manager) lockBucket(k *Key) *bucket { } // lockBuckets returns locked buckets for the given keys. -func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { +// +checklocksacquire:b1.mu +// +checklocksacquire:b2.mu +func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) { // Buckets must be consistently ordered to avoid circular lock // dependencies. We order buckets in m.privateBuckets by index (lowest // index first), and all buckets in m.privateBuckets precede @@ -419,8 +421,8 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { i1 := bucketIndexForAddr(k1.addr()) i2 := bucketIndexForAddr(k2.addr()) - b1 := &m.privateBuckets[i1] - b2 := &m.privateBuckets[i2] + b1 = &m.privateBuckets[i1] + b2 = &m.privateBuckets[i2] switch { case i1 < i2: b1.mu.Lock() @@ -431,19 +433,30 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { default: b1.mu.Lock() } - return b1, b2 + return b1, b2 // +checklocksforce } // At least one of b1 or b2 should be m.sharedBucket. - b1 := m.sharedBucket - b2 := m.sharedBucket + b1 = m.sharedBucket + b2 = m.sharedBucket if k1.Kind != KindSharedMappable { b1 = m.lockBucket(k1) } else if k2.Kind != KindSharedMappable { b2 = m.lockBucket(k2) } m.sharedBucket.mu.Lock() - return b1, b2 + return b1, b2 // +checklocksforce +} + +// unlockBuckets unlocks two buckets. +// +checklocksrelease:b1.mu +// +checklocksrelease:b2.mu +func (m *Manager) unlockBuckets(b1, b2 *bucket) { + b1.mu.Unlock() + if b1 != b2 { + b2.mu.Unlock() + } + return // +checklocksforce } // Wake wakes up to n waiters matching the bitmask on the given addr. @@ -476,10 +489,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, c defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) if checkval { if err := check(t, addr, val); err != nil { @@ -526,10 +536,7 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwa defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) - defer b1.mu.Unlock() - if b2 != b1 { - defer b2.mu.Unlock() - } + defer m.unlockBuckets(b1, b2) done := 0 cond, err := atomicOp(t, addr2, op) @@ -670,7 +677,7 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint return false, err } if (cur & linux.FUTEX_TID_MASK) == tid { - return false, syserror.EDEADLK + return false, linuxerr.EDEADLK } if (cur & linux.FUTEX_TID_MASK) == 0 { @@ -745,7 +752,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu } if (cur & linux.FUTEX_TID_MASK) != tid { - return syserror.EPERM + return linuxerr.EPERM } var next *Waiter // Who's the next owner? @@ -773,7 +780,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu if prev != cur { // Let user mode handle CAS races. This is different than lock, which // retries when CAS fails. - return syserror.EAGAIN + return linuxerr.EAGAIN } return nil } @@ -790,7 +797,7 @@ func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bu return err } if prev != cur { - return syserror.EINVAL + return linuxerr.EINVAL } b.wakeWaiterLocked(next) diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index deba44e5c..04c136f87 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -21,8 +21,8 @@ import ( "testing" "unsafe" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sync" ) @@ -488,7 +488,7 @@ func (t *testMutex) Lock() { // Wait for it to be "not locked". w := NewWaiter() err := t.m.WaitPrepare(w, t.d, t.a, true, testMutexLocked, ^uint32(0)) - if err == unix.EAGAIN { + if linuxerr.Equals(linuxerr.EAGAIN, err) { continue } if err != nil { diff --git a/pkg/sentry/kernel/ipc/BUILD b/pkg/sentry/kernel/ipc/BUILD new file mode 100644 index 000000000..e42a94e15 --- /dev/null +++ b/pkg/sentry/kernel/ipc/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "ipc", + srcs = [ + "object.go", + "registry.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/log", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + ], +) diff --git a/pkg/sentry/kernel/ipc/object.go b/pkg/sentry/kernel/ipc/object.go new file mode 100644 index 000000000..facd157c7 --- /dev/null +++ b/pkg/sentry/kernel/ipc/object.go @@ -0,0 +1,150 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ipc defines functionality and utilities common to sysvipc mechanisms. +// +// Lock ordering: [shm/semaphore/msgqueue].Registry.mu -> Mechanism +package ipc + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// Key is a user-provided identifier for IPC objects. +type Key int32 + +// ID is a kernel identifier for IPC objects. +type ID int32 + +// Object represents an abstract IPC object with fields common to all IPC +// mechanisms. +// +// +stateify savable +type Object struct { + // User namespace which owns the IPC namespace which owns the IPC object. + // Immutable. + UserNS *auth.UserNamespace + + // ID is a kernel identifier for the IPC object. Immutable. + ID ID + + // Key is a user-provided identifier for the IPC object. Immutable. + Key Key + + // Creator is the user who created the IPC object. Immutable. + Creator fs.FileOwner + + // Owner is the current owner of the IPC object. + Owner fs.FileOwner + + // Perms is the access permissions the IPC object. + Perms fs.FilePermissions +} + +// Mechanism represents a SysV mechanism that holds an IPC object. It can also +// be looked at as a container for an ipc.Object, which is by definition a fully +// functional SysV object. +type Mechanism interface { + // Lock behaves the same as Mutex.Lock on the mechanism. + Lock() + + // Unlock behaves the same as Mutex.Unlock on the mechanism. + Unlock() + + // Object returns a pointer to the mechanism's ipc.Object. Mechanism.Lock, + // and Mechanism.Unlock should be used when the object is used. + Object() *Object + + // Destroy destroys the mechanism. + Destroy() +} + +// NewObject returns a new, initialized ipc.Object. The newly returned object +// doesn't have a valid ID. When the object is registered, the registry assigns +// it a new unique ID. +func NewObject(un *auth.UserNamespace, key Key, creator, owner fs.FileOwner, perms fs.FilePermissions) *Object { + return &Object{ + UserNS: un, + Key: key, + Creator: creator, + Owner: owner, + Perms: perms, + } +} + +// CheckOwnership verifies whether an IPC object may be accessed using creds as +// an owner. See ipc/util.c:ipcctl_obtain_check() in Linux. +func (o *Object) CheckOwnership(creds *auth.Credentials) bool { + if o.Owner.UID == creds.EffectiveKUID || o.Creator.UID == creds.EffectiveKUID { + return true + } + + // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux + // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented + // for use to "override IPC ownership checks". + return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, o.UserNS) +} + +// CheckPermissions verifies whether an IPC object is accessible using creds for +// access described by req. See ipc/util.c:ipcperms() in Linux. +func (o *Object) CheckPermissions(creds *auth.Credentials, req fs.PermMask) bool { + p := o.Perms.Other + if o.Owner.UID == creds.EffectiveKUID { + p = o.Perms.User + } else if creds.InGroup(o.Owner.GID) { + p = o.Perms.Group + } + + if p.SupersetOf(req) { + return true + } + return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, o.UserNS) +} + +// Set modifies attributes for an IPC object. See *ctl(IPC_SET). +// +// Precondition: Mechanism.mu must be held. +func (o *Object) Set(ctx context.Context, perm *linux.IPCPerm) error { + creds := auth.CredentialsFromContext(ctx) + uid := creds.UserNamespace.MapToKUID(auth.UID(perm.UID)) + gid := creds.UserNamespace.MapToKGID(auth.GID(perm.GID)) + if !uid.Ok() || !gid.Ok() { + // The man pages don't specify an errno for invalid uid/gid, but EINVAL + // is generally used for invalid arguments. + return linuxerr.EINVAL + } + + if !o.CheckOwnership(creds) { + // "The argument cmd has the value IPC_SET or IPC_RMID, but the + // effective user ID of the calling process is not the creator (as + // found in msg_perm.cuid) or the owner (as found in msg_perm.uid) + // of the message queue, and the caller is not privileged (Linux: + // does not have the CAP_SYS_ADMIN capability)." + return linuxerr.EPERM + } + + // User may only modify the lower 9 bits of the mode. All the other bits are + // always 0 for the underlying inode. + mode := linux.FileMode(perm.Mode & 0x1ff) + + o.Perms = fs.FilePermsFromMode(mode) + o.Owner.UID = uid + o.Owner.GID = gid + + return nil +} diff --git a/pkg/sentry/kernel/ipc/registry.go b/pkg/sentry/kernel/ipc/registry.go new file mode 100644 index 000000000..91de19070 --- /dev/null +++ b/pkg/sentry/kernel/ipc/registry.go @@ -0,0 +1,196 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipc + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// Registry is similar to Object, but for registries. It represent an abstract +// SysV IPC registry with fields common to all SysV registries. Registry is not +// thread-safe, and should be protected using a mutex. +// +// +stateify savable +type Registry struct { + // UserNS owning the IPC namespace this registry belongs to. Immutable. + UserNS *auth.UserNamespace + + // objects is a map of IDs to IPC mechanisms. + objects map[ID]Mechanism + + // KeysToIDs maps a lookup key to an ID. + keysToIDs map[Key]ID + + // lastIDUsed is used to find the next available ID for object creation. + lastIDUsed ID +} + +// NewRegistry return a new, initialized ipc.Registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + UserNS: userNS, + objects: make(map[ID]Mechanism), + keysToIDs: make(map[Key]ID), + } +} + +// Find uses key to search for and return a SysV mechanism. Find returns an +// error if an object is found by shouldn't be, or if the user doesn't have +// permission to use the object. If no object is found, Find checks create +// flag, and returns an error only if it's false. +func (r *Registry) Find(ctx context.Context, key Key, mode linux.FileMode, create, exclusive bool) (Mechanism, error) { + if id, ok := r.keysToIDs[key]; ok { + mech := r.objects[id] + mech.Lock() + defer mech.Unlock() + + obj := mech.Object() + creds := auth.CredentialsFromContext(ctx) + if !obj.CheckPermissions(creds, fs.PermsFromMode(mode)) { + // The [calling process / user] does not have permission to access + // the set, and does not have the CAP_IPC_OWNER capability in the + // user namespace that governs its IPC namespace. + return nil, linuxerr.EACCES + } + + if create && exclusive { + // IPC_CREAT and IPC_EXCL were specified, but an object already + // exists for key. + return nil, linuxerr.EEXIST + } + return mech, nil + } + + if !create { + // No object exists for key and msgflg did not specify IPC_CREAT. + return nil, linuxerr.ENOENT + } + + return nil, nil +} + +// Register adds the given object into Registry.Objects, and assigns it a new +// ID. It returns an error if all IDs are exhausted. +func (r *Registry) Register(m Mechanism) error { + id, err := r.newID() + if err != nil { + return err + } + + obj := m.Object() + obj.ID = id + + r.objects[id] = m + r.keysToIDs[obj.Key] = id + + return nil +} + +// newID finds the first unused ID in the registry, and returns an error if +// non is found. +func (r *Registry) newID() (ID, error) { + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.objects[id] == nil { + r.lastIDUsed = id + return id, nil + } + } + + log.Warningf("ids exhausted, they may be leaking") + + // The man pages for shmget(2) mention that ENOSPC should be used if "All + // possible shared memory IDs have been taken (SHMMNI)". Other SysV + // mechanisms don't have a specific errno for running out of IDs, but they + // return ENOSPC if the max number of objects is exceeded, so we assume that + // it's the same case. + return 0, linuxerr.ENOSPC +} + +// Remove removes the mechanism with the given id from the registry, and calls +// mechanism.Destroy to perform mechanism-specific removal. +func (r *Registry) Remove(id ID, creds *auth.Credentials) error { + mech := r.objects[id] + if mech == nil { + return linuxerr.EINVAL + } + + mech.Lock() + defer mech.Unlock() + + obj := mech.Object() + + // The effective user ID of the calling process must match the creator or + // owner of the [mechanism], or the caller must be privileged. + if !obj.CheckOwnership(creds) { + return linuxerr.EPERM + } + + delete(r.objects, obj.ID) + delete(r.keysToIDs, obj.Key) + mech.Destroy() + + return nil +} + +// ForAllObjects executes a given function for all given objects. +func (r *Registry) ForAllObjects(f func(o Mechanism)) { + for _, o := range r.objects { + f(o) + } +} + +// FindByID returns the mechanism with the given ID, nil if non exists. +func (r *Registry) FindByID(id ID) Mechanism { + return r.objects[id] +} + +// DissociateKey removes the association between a mechanism and its key +// (deletes it from r.keysToIDs), preventing it from being discovered by any new +// process, but not necessarily destroying it. If the given key doesn't exist, +// nothing is changed. +func (r *Registry) DissociateKey(key Key) { + delete(r.keysToIDs, key) +} + +// DissociateID removes the association between a mechanism and its ID (deletes +// it from r.objects). An ID can't be removed unless the associated key is +// removed already, this is done to prevent the users from acquiring nil a +// Mechanism. +// +// Precondition: must be preceded by a call to r.DissociateKey. +func (r *Registry) DissociateID(id ID) { + delete(r.objects, id) +} + +// ObjectCount returns the number of registered objects. +func (r *Registry) ObjectCount() int { + return len(r.objects) +} + +// LastIDUsed returns the last used ID. +func (r *Registry) LastIDUsed() ID { + return r.lastIDUsed +} diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 9545bb5ef..0b101b1bb 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" ) @@ -30,6 +31,7 @@ type IPCNamespace struct { // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace + queues *msgqueue.Registry semaphores *semaphore.Registry shms *shm.Registry } @@ -38,6 +40,7 @@ type IPCNamespace struct { func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { ns := &IPCNamespace{ userNS: userNS, + queues: msgqueue.NewRegistry(userNS), semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } @@ -45,6 +48,11 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { return ns } +// MsgqueueRegistry returns the message queue registry for this namespace. +func (i *IPCNamespace) MsgqueueRegistry() *msgqueue.Registry { + return i.queues +} + // SemaphoreRegistry returns the semaphore set registry for this namespace. func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { return i.semaphores diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go index 4b943106b..e8a71bec1 100644 --- a/pkg/sentry/kernel/kcov.go +++ b/pkg/sentry/kernel/kcov.go @@ -22,13 +22,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov @@ -125,19 +125,19 @@ func (kcov *Kcov) InitTrace(size uint64) error { defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_DISABLED { - return syserror.EBUSY + return linuxerr.EBUSY } // To simplify all the logic around mapping, we require that the length of the // shared region is a multiple of the system page size. if (8*size)&(hostarch.PageSize-1) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // We need space for at least two uint64s to hold current position and a // single PC. if size < 2 || size > kcovAreaSizeMax { - return syserror.EINVAL + return linuxerr.EINVAL } kcov.size = size @@ -157,7 +157,7 @@ func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call. if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil { - return syserror.EINVAL + return linuxerr.EINVAL } switch traceKind { @@ -165,13 +165,13 @@ func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { kcov.mode = linux.KCOV_MODE_TRACE_PC case linux.KCOV_TRACE_CMP: // We do not support KCOV_MODE_TRACE_CMP. - return syserror.ENOTSUP + return linuxerr.ENOTSUP default: - return syserror.EINVAL + return linuxerr.EINVAL } if kcov.owningTask != nil && kcov.owningTask != t { - return syserror.EBUSY + return linuxerr.EBUSY } kcov.owningTask = t @@ -195,7 +195,7 @@ func (kcov *Kcov) DisableTrace(ctx context.Context) error { } if t != kcov.owningTask { - return syserror.EINVAL + return linuxerr.EINVAL } kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil @@ -237,7 +237,7 @@ func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) erro defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_INIT { - return syserror.EINVAL + return linuxerr.EINVAL } if kcov.mappable == nil { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index d537e608a..df5160b67 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -143,12 +143,6 @@ type Kernel struct { // to CreateProcess, and is protected by extMu. globalInit *ThreadGroup - // realtimeClock is a ktime.Clock based on timekeeper's Realtime. - realtimeClock *timekeeperClock - - // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. - monotonicClock *timekeeperClock - // syslog is the kernel log. syslog syslog @@ -306,6 +300,9 @@ type InitKernelArgs struct { // FeatureSet is the emulated CPU feature set. FeatureSet *cpuid.FeatureSet + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace @@ -345,24 +342,18 @@ type InitKernelArgs struct { PIDNamespace *PIDNamespace } -// SetTimekeeper sets Kernel.timekeeper. SetTimekeeper must be called before -// Init. -func (k *Kernel) SetTimekeeper(tk *Timekeeper) { - k.timekeeper = tk -} - // Init initialize the Kernel with no tasks. // // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile -// and Kernel.SetTimekeeper before calling Init. +// before calling Init. func (k *Kernel) Init(args InitKernelArgs) error { if args.FeatureSet == nil { return fmt.Errorf("args.FeatureSet is nil") } - if k.timekeeper == nil { - return fmt.Errorf("timekeeper is nil") + if args.Timekeeper == nil { + return fmt.Errorf("args.Timekeeper is nil") } - if k.timekeeper.clocks == nil { + if args.Timekeeper.clocks == nil { return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { @@ -373,6 +364,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper k.tasks = newTaskSet(args.PIDNamespace) k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace @@ -397,8 +389,6 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.extraAuxv = args.ExtraAuxv k.vdso = args.Vdso - k.realtimeClock = &timekeeperClock{tk: k.timekeeper, c: sentrytime.Realtime} - k.monotonicClock = &timekeeperClock{tk: k.timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() k.ptraceExceptions = make(map[*Task]*Task) @@ -531,6 +521,8 @@ func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error { } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) + // Save the timekeeper's state. + // Save the kernel state. kernelStart := time.Now() stats, err := state.Save(ctx, w, k) @@ -675,7 +667,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } // LoadFrom returns a new Kernel loaded from args. -func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { +func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { loadStart := time.Now() initAppCores := k.applicationCores @@ -722,6 +714,11 @@ func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, cl log.Infof("Overall load took [%s]", time.Since(loadStart)) k.Timekeeper().SetClocks(clocks) + + if timeReady != nil { + close(timeReady) + } + if net != nil { net.Resume() } @@ -1103,7 +1100,7 @@ func (k *Kernel) Start() error { } k.started = true - k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) + k.cpuClockTicker = ktime.NewTimer(k.timekeeper.monotonicClock, newKernelCPUClockTicker(k)) k.cpuClockTicker.Swap(ktime.Setting{ Enabled: true, Period: linux.ClockTick, @@ -1258,7 +1255,7 @@ func (k *Kernel) incRunningTasks() { // These cause very different value of cpuClock. But again, since // nothing was running while the ticker was disabled, those differences // don't matter. - setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now()) + setting, exp := k.cpuClockTickerSetting.At(k.timekeeper.monotonicClock.Now()) if exp > 0 { atomic.AddUint64(&k.cpuClock, exp) } @@ -1302,11 +1299,11 @@ func (k *Kernel) WaitExited() { } // Kill requests that all tasks in k immediately exit as if group exiting with -// status es. Kill does not wait for tasks to exit. -func (k *Kernel) Kill(es ExitStatus) { +// status ws. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(ws linux.WaitStatus) { k.extMu.Lock() defer k.extMu.Unlock() - k.tasks.Kill(es) + k.tasks.Kill(ws) } // Pause requests that all tasks in k temporarily stop executing, and blocks @@ -1468,12 +1465,12 @@ func (k *Kernel) ApplicationCores() uint { // RealtimeClock returns the application CLOCK_REALTIME clock. func (k *Kernel) RealtimeClock() ktime.Clock { - return k.realtimeClock + return k.timekeeper.realtimeClock } // MonotonicClock returns the application CLOCK_MONOTONIC clock. func (k *Kernel) MonotonicClock() ktime.Clock { - return k.monotonicClock + return k.timekeeper.monotonicClock } // CPUClockNow returns the current value of k.cpuClock. @@ -1553,32 +1550,6 @@ func (k *Kernel) SetSaveError(err error) { } } -var _ tcpip.Clock = (*Kernel)(nil) - -// Now implements tcpip.Clock.NowNanoseconds. -func (k *Kernel) Now() time.Time { - nsec, err := k.timekeeper.GetTime(sentrytime.Realtime) - if err != nil { - panic("timekeeper.GetTime(sentrytime.Realtime): " + err.Error()) - } - return time.Unix(0, nsec) -} - -// NowMonotonic implements tcpip.Clock.NowMonotonic. -func (k *Kernel) NowMonotonic() tcpip.MonotonicTime { - nsec, err := k.timekeeper.GetTime(sentrytime.Monotonic) - if err != nil { - panic("timekeeper.GetTime(sentrytime.Monotonic): " + err.Error()) - } - var mt tcpip.MonotonicTime - return mt.Add(time.Duration(nsec) * time.Nanosecond) -} - -// AfterFunc implements tcpip.Clock.AfterFunc. -func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer { - return ktime.TcpipAfterFunc(k.realtimeClock, d, f) -} - // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go index 2e66ec587..5ffafb0d1 100644 --- a/pkg/sentry/kernel/kernel_opts.go +++ b/pkg/sentry/kernel/kernel_opts.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package kernel // SpecialOpts contains non-standard options for the kernel. diff --git a/pkg/sentry/kernel/msgqueue/BUILD b/pkg/sentry/kernel/msgqueue/BUILD new file mode 100644 index 000000000..5ec11e1f6 --- /dev/null +++ b/pkg/sentry/kernel/msgqueue/BUILD @@ -0,0 +1,36 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "message_list", + out = "message_list.go", + package = "msgqueue", + prefix = "msg", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Message", + "Linker": "*Message", + }, +) + +go_library( + name = "msgqueue", + srcs = [ + "message_list.go", + "msgqueue.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/ipc", + "//pkg/sentry/kernel/time", + "//pkg/sync", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/msgqueue/msgqueue.go b/pkg/sentry/kernel/msgqueue/msgqueue.go new file mode 100644 index 000000000..c7c5e41fb --- /dev/null +++ b/pkg/sentry/kernel/msgqueue/msgqueue.go @@ -0,0 +1,618 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package msgqueue implements System V message queues. +package msgqueue + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + // System-wide limit for maximum number of queues. + maxQueues = linux.MSGMNI + + // Maximum size of a queue in bytes. + maxQueueBytes = linux.MSGMNB + + // Maximum size of a message in bytes. + maxMessageBytes = linux.MSGMAX +) + +// Registry contains a set of message queues that can be referenced using keys +// or IDs. +// +// +stateify savable +type Registry struct { + // mu protects all the fields below. + mu sync.Mutex `state:"nosave"` + + // reg defines basic fields and operations needed for all SysV registries. + reg *ipc.Registry +} + +// NewRegistry returns a new Registry ready to be used. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + reg: ipc.NewRegistry(userNS), + } +} + +// Queue represents a SysV message queue, described by sysvipc(7). +// +// +stateify savable +type Queue struct { + // registry is the registry owning this queue. Immutable. + registry *Registry + + // mu protects all the fields below. + mu sync.Mutex `state:"nosave"` + + // dead is set to true when a queue is removed from the registry and should + // not be used. Operations on the queue should check dead, and return + // EIDRM if set to true. + dead bool + + // obj defines basic fields that should be included in all SysV IPC objects. + obj *ipc.Object + + // senders holds a queue of blocked message senders. Senders are notified + // when enough space is available in the queue to insert their message. + senders waiter.Queue + + // receivers holds a queue of blocked receivers. Receivers are notified + // when a new message is inserted into the queue and can be received. + receivers waiter.Queue + + // messages is a list of sent messages. + messages msgList + + // sendTime is the last time a msgsnd was perfomed. + sendTime ktime.Time + + // receiveTime is the last time a msgrcv was performed. + receiveTime ktime.Time + + // changeTime is the last time the queue was modified using msgctl. + changeTime ktime.Time + + // byteCount is the current number of message bytes in the queue. + byteCount uint64 + + // messageCount is the current number of messages in the queue. + messageCount uint64 + + // maxBytes is the maximum allowed number of bytes in the queue, and is also + // used as a limit for the number of total possible messages. + maxBytes uint64 + + // sendPID is the PID of the process that performed the last msgsnd. + sendPID int32 + + // receivePID is the PID of the process that performed the last msgrcv. + receivePID int32 +} + +// Message represents a message exchanged through a Queue via msgsnd(2) and +// msgrcv(2). +// +// +stateify savable +type Message struct { + msgEntry + + // Type is an integer representing the type of the sent message. + Type int64 + + // Text is an untyped block of memory. + Text []byte + + // Size is the size of Text. + Size uint64 +} + +func (m *Message) makeCopy() *Message { + new := &Message{ + Type: m.Type, + Size: m.Size, + } + new.Text = make([]byte, len(m.Text)) + copy(new.Text, m.Text) + return new +} + +// Blocker is used for blocking Queue.Send, and Queue.Receive calls that serves +// as an abstracted version of kernel.Task. kernel.Task is not directly used to +// prevent circular dependencies. +type Blocker interface { + Block(C <-chan struct{}) error +} + +// FindOrCreate creates a new message queue or returns an existing one. See +// msgget(2). +func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, mode linux.FileMode, private, create, exclusive bool) (*Queue, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if !private { + queue, err := r.reg.Find(ctx, key, mode, create, exclusive) + if err != nil { + return nil, err + } + + if queue != nil { + return queue.(*Queue), nil + } + } + + // Check system-wide limits. + if r.reg.ObjectCount() >= maxQueues { + return nil, linuxerr.ENOSPC + } + + return r.newQueueLocked(ctx, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode)) +} + +// newQueueLocked creates a new queue using the given fields. An error is +// returned if there're no more available identifiers. +// +// Precondition: r.mu must be held. +func (r *Registry) newQueueLocked(ctx context.Context, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions) (*Queue, error) { + q := &Queue{ + registry: r, + obj: ipc.NewObject(r.reg.UserNS, key, creator, creator, perms), + sendTime: ktime.ZeroTime, + receiveTime: ktime.ZeroTime, + changeTime: ktime.NowFromContext(ctx), + maxBytes: maxQueueBytes, + } + + err := r.reg.Register(q) + if err != nil { + return nil, err + } + return q, nil +} + +// Remove removes the queue with specified ID. All waiters (readers and +// writers) and writers will be awakened and fail. Remove will return an error +// if the ID is invalid, or the the user doesn't have privileges. +func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error { + r.mu.Lock() + defer r.mu.Unlock() + + r.reg.Remove(id, creds) + return nil +} + +// FindByID returns the queue with the specified ID and an error if the ID +// doesn't exist. +func (r *Registry) FindByID(id ipc.ID) (*Queue, error) { + r.mu.Lock() + defer r.mu.Unlock() + + mech := r.reg.FindByID(id) + if mech == nil { + return nil, linuxerr.EINVAL + } + return mech.(*Queue), nil +} + +// IPCInfo reports global parameters for message queues. See msgctl(IPC_INFO). +func (r *Registry) IPCInfo(ctx context.Context) *linux.MsgInfo { + return &linux.MsgInfo{ + MsgPool: linux.MSGPOOL, + MsgMap: linux.MSGMAP, + MsgMax: linux.MSGMAX, + MsgMnb: linux.MSGMNB, + MsgMni: linux.MSGMNI, + MsgSsz: linux.MSGSSZ, + MsgTql: linux.MSGTQL, + MsgSeg: linux.MSGSEG, + } +} + +// MsgInfo reports global parameters for message queues. See msgctl(MSG_INFO). +func (r *Registry) MsgInfo(ctx context.Context) *linux.MsgInfo { + r.mu.Lock() + defer r.mu.Unlock() + + var messages, bytes uint64 + r.reg.ForAllObjects( + func(o ipc.Mechanism) { + q := o.(*Queue) + q.mu.Lock() + messages += q.messageCount + bytes += q.byteCount + q.mu.Unlock() + }, + ) + + return &linux.MsgInfo{ + MsgPool: int32(r.reg.ObjectCount()), + MsgMap: int32(messages), + MsgTql: int32(bytes), + MsgMax: linux.MSGMAX, + MsgMnb: linux.MSGMNB, + MsgMni: linux.MSGMNI, + MsgSsz: linux.MSGSSZ, + MsgSeg: linux.MSGSEG, + } +} + +// Send appends a message to the message queue, and returns an error if sending +// fails. See msgsnd(2). +func (q *Queue) Send(ctx context.Context, m Message, b Blocker, wait bool, pid int32) error { + // Try to perform a non-blocking send using queue.append. If EWOULDBLOCK + // is returned, start the blocking procedure. Otherwise, return normally. + creds := auth.CredentialsFromContext(ctx) + + // Fast path: first attempt a non-blocking push. + if err := q.push(ctx, m, creds, pid); err != linuxerr.EWOULDBLOCK { + return err + } + + if !wait { + return linuxerr.EAGAIN + } + + // Slow path: at this point, the queue was found to be full, and we were + // asked to block. + + e, ch := waiter.NewChannelEntry(nil) + q.senders.EventRegister(&e, waiter.EventOut) + defer q.senders.EventUnregister(&e) + + // Note: we need to check again before blocking the first time since space + // may have become available. + for { + if err := q.push(ctx, m, creds, pid); err != linuxerr.EWOULDBLOCK { + return err + } + if err := b.Block(ch); err != nil { + return err + } + } +} + +// push appends a message to the queue's message list and notifies waiting +// receivers that a message has been inserted. It returns an error if adding +// the message would cause the queue to exceed its maximum capacity, which can +// be used as a signal to block the task. Other errors should be returned as is. +func (q *Queue) push(ctx context.Context, m Message, creds *auth.Credentials, pid int32) error { + if m.Type <= 0 { + return linuxerr.EINVAL + } + + q.mu.Lock() + defer q.mu.Unlock() + + if !q.obj.CheckPermissions(creds, fs.PermMask{Write: true}) { + // The calling process does not have write permission on the message + // queue, and does not have the CAP_IPC_OWNER capability in the user + // namespace that governs its IPC namespace. + return linuxerr.EACCES + } + + // Queue was removed while the process was waiting. + if q.dead { + return linuxerr.EIDRM + } + + // Check if sufficient space is available (the queue isn't full.) From + // the man pages: + // + // "A message queue is considered to be full if either of the following + // conditions is true: + // + // • Adding a new message to the queue would cause the total number + // of bytes in the queue to exceed the queue's maximum size (the + // msg_qbytes field). + // + // • Adding another message to the queue would cause the total + // number of messages in the queue to exceed the queue's maximum + // size (the msg_qbytes field). This check is necessary to + // prevent an unlimited number of zero-length messages being + // placed on the queue. Although such messages contain no data, + // they nevertheless consume (locked) kernel memory." + // + // The msg_qbytes field in our implementation is q.maxBytes. + if m.Size+q.byteCount > q.maxBytes || q.messageCount+1 > q.maxBytes { + return linuxerr.EWOULDBLOCK + } + + // Copy the message into the queue. + q.messages.PushBack(&m) + + q.byteCount += m.Size + q.messageCount++ + q.sendPID = pid + q.sendTime = ktime.NowFromContext(ctx) + + // Notify receivers about the new message. + q.receivers.Notify(waiter.EventIn) + + return nil +} + +// Receive removes a message from the queue and returns it. See msgrcv(2). +func (q *Queue) Receive(ctx context.Context, b Blocker, mType int64, maxSize int64, wait, truncate, except bool, pid int32) (*Message, error) { + if maxSize < 0 || maxSize > maxMessageBytes { + return nil, linuxerr.EINVAL + } + max := uint64(maxSize) + creds := auth.CredentialsFromContext(ctx) + + // Fast path: first attempt a non-blocking pop. + if msg, err := q.pop(ctx, creds, mType, max, truncate, except, pid); err != linuxerr.EWOULDBLOCK { + return msg, err + } + + if !wait { + return nil, linuxerr.ENOMSG + } + + // Slow path: at this point, the queue was found to be empty, and we were + // asked to block. + + e, ch := waiter.NewChannelEntry(nil) + q.receivers.EventRegister(&e, waiter.EventIn) + defer q.receivers.EventUnregister(&e) + + // Note: we need to check again before blocking the first time since a + // message may have become available. + for { + if msg, err := q.pop(ctx, creds, mType, max, truncate, except, pid); err != linuxerr.EWOULDBLOCK { + return msg, err + } + if err := b.Block(ch); err != nil { + return nil, err + } + } +} + +// pop pops the first message from the queue that matches the given type. It +// returns an error for all the cases specified in msgrcv(2). If the queue is +// empty or no message of the specified type is available, a EWOULDBLOCK error +// is returned, which can then be used as a signal to block the process or fail. +func (q *Queue) pop(ctx context.Context, creds *auth.Credentials, mType int64, maxSize uint64, truncate, except bool, pid int32) (*Message, error) { + q.mu.Lock() + defer q.mu.Unlock() + + if !q.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + // The calling process does not have read permission on the message + // queue, and does not have the CAP_IPC_OWNER capability in the user + // namespace that governs its IPC namespace. + return nil, linuxerr.EACCES + } + + // Queue was removed while the process was waiting. + if q.dead { + return nil, linuxerr.EIDRM + } + + if q.messages.Empty() { + return nil, linuxerr.EWOULDBLOCK + } + + // Get a message from the queue. + var msg *Message + switch { + case mType == 0: + msg = q.messages.Front() + case mType > 0: + msg = q.msgOfType(mType, except) + case mType < 0: + msg = q.msgOfTypeLessThan(-1 * mType) + } + + // If no message exists, return a blocking singal. + if msg == nil { + return nil, linuxerr.EWOULDBLOCK + } + + // Check message's size is acceptable. + if maxSize < msg.Size { + if !truncate { + return nil, linuxerr.E2BIG + } + msg.Size = maxSize + msg.Text = msg.Text[:maxSize+1] + } + + q.messages.Remove(msg) + + q.byteCount -= msg.Size + q.messageCount-- + q.receivePID = pid + q.receiveTime = ktime.NowFromContext(ctx) + + // Notify senders about available space. + q.senders.Notify(waiter.EventOut) + + return msg, nil +} + +// Copy copies a message from the queue without deleting it. If no message +// exists, an error is returned. See msgrcv(MSG_COPY). +func (q *Queue) Copy(mType int64) (*Message, error) { + q.mu.Lock() + defer q.mu.Unlock() + + if mType < 0 || q.messages.Empty() { + return nil, linuxerr.ENOMSG + } + + msg := q.msgAtIndex(mType) + if msg == nil { + return nil, linuxerr.ENOMSG + } + return msg.makeCopy(), nil +} + +// msgOfType returns the first message with the specified type, nil if no +// message is found. If except is true, the first message of a type not equal +// to mType will be returned. +// +// Precondition: caller must hold q.mu. +func (q *Queue) msgOfType(mType int64, except bool) *Message { + if except { + for msg := q.messages.Front(); msg != nil; msg = msg.Next() { + if msg.Type != mType { + return msg + } + } + return nil + } + + for msg := q.messages.Front(); msg != nil; msg = msg.Next() { + if msg.Type == mType { + return msg + } + } + return nil +} + +// msgOfTypeLessThan return the the first message with the lowest type less +// than or equal to mType, nil if no such message exists. +// +// Precondition: caller must hold q.mu. +func (q *Queue) msgOfTypeLessThan(mType int64) (m *Message) { + min := mType + for msg := q.messages.Front(); msg != nil; msg = msg.Next() { + if msg.Type <= mType && msg.Type < min { + m = msg + min = msg.Type + } + } + return m +} + +// msgAtIndex returns a pointer to a message at given index, nil if non exits. +// +// Precondition: caller must hold q.mu. +func (q *Queue) msgAtIndex(mType int64) *Message { + msg := q.messages.Front() + for ; mType != 0 && msg != nil; mType-- { + msg = msg.Next() + } + return msg +} + +// Set modifies some values of the queue. See msgctl(IPC_SET). +func (q *Queue) Set(ctx context.Context, ds *linux.MsqidDS) error { + q.mu.Lock() + defer q.mu.Unlock() + + creds := auth.CredentialsFromContext(ctx) + if ds.MsgQbytes > maxQueueBytes && !creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, q.obj.UserNS) { + // "An attempt (IPC_SET) was made to increase msg_qbytes beyond the + // system parameter MSGMNB, but the caller is not privileged (Linux: + // does not have the CAP_SYS_RESOURCE capability)." + return linuxerr.EPERM + } + + if err := q.obj.Set(ctx, &ds.MsgPerm); err != nil { + return err + } + + q.maxBytes = ds.MsgQbytes + q.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// Stat returns a MsqidDS object filled with information about the queue. See +// msgctl(IPC_STAT) and msgctl(MSG_STAT). +func (q *Queue) Stat(ctx context.Context) (*linux.MsqidDS, error) { + return q.stat(ctx, fs.PermMask{Read: true}) +} + +// StatAny is similar to Queue.Stat, but doesn't require read permission. See +// msgctl(MSG_STAT_ANY). +func (q *Queue) StatAny(ctx context.Context) (*linux.MsqidDS, error) { + return q.stat(ctx, fs.PermMask{}) +} + +// stat returns a MsqidDS object filled with information about the queue. An +// error is returned if the user doesn't have the specified permissions. +func (q *Queue) stat(ctx context.Context, mask fs.PermMask) (*linux.MsqidDS, error) { + q.mu.Lock() + defer q.mu.Unlock() + + creds := auth.CredentialsFromContext(ctx) + if !q.obj.CheckPermissions(creds, mask) { + // "The caller must have read permission on the message queue." + return nil, linuxerr.EACCES + } + + return &linux.MsqidDS{ + MsgPerm: linux.IPCPerm{ + Key: uint32(q.obj.Key), + UID: uint32(creds.UserNamespace.MapFromKUID(q.obj.Owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(q.obj.Owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(q.obj.Creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(q.obj.Creator.GID)), + Mode: uint16(q.obj.Perms.LinuxMode()), + Seq: 0, // IPC sequences not supported. + }, + MsgStime: q.sendTime.TimeT(), + MsgRtime: q.receiveTime.TimeT(), + MsgCtime: q.changeTime.TimeT(), + MsgCbytes: q.byteCount, + MsgQnum: q.messageCount, + MsgQbytes: q.maxBytes, + MsgLspid: q.sendPID, + MsgLrpid: q.receivePID, + }, nil +} + +// Lock implements ipc.Mechanism.Lock. +func (q *Queue) Lock() { + q.mu.Lock() +} + +// Unlock implements ipc.mechanism.Unlock. +// +// +checklocksignore +func (q *Queue) Unlock() { + q.mu.Unlock() +} + +// Object implements ipc.Mechanism.Object. +func (q *Queue) Object() *ipc.Object { + return q.obj +} + +// Destroy implements ipc.Mechanism.Destroy. +func (q *Queue) Destroy() { + q.dead = true + + // Notify waiters. Senders and receivers will try to run, and return an + // error (EIDRM). Waiters should remove themselves from the queue after + // waking up. + q.senders.Notify(waiter.EventOut) + q.receivers.Notify(waiter.EventIn) +} + +// ID returns queue's ID. +func (q *Queue) ID() ipc.ID { + return q.obj.ID +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 34c617b08..5b2bac783 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal/primitive", "//pkg/safemem", @@ -30,7 +31,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -47,9 +47,9 @@ go_test( library = ":pipe", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 6497dc4ba..615591507 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -17,10 +17,10 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // inodeOperations implements fs.InodeOperations for pipes. @@ -94,7 +94,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { if !waitFor(&i.mu, &i.wWakeup, ctx) { r.DecRef(ctx) - return nil, syserror.ErrInterrupted + return nil, linuxerr.ErrInterrupted } } @@ -112,12 +112,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // read side isn't open yet. if flags.NonBlocking { w.DecRef(ctx) - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } if !waitFor(&i.mu, &i.rWakeup, ctx) { w.DecRef(ctx) - return nil, syserror.ErrInterrupted + return nil, linuxerr.ErrInterrupted } } return w, nil @@ -130,10 +130,10 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi return rw, nil default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { - return syserror.EPIPE + return linuxerr.EPIPE } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index d6fb0fdb8..31bd7910a 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -19,9 +19,9 @@ import ( "time" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/syserror" ) type sleeper struct { @@ -239,7 +239,7 @@ func TestBlockedOpenIsCancellable(t *testing.T) { // If the cancel on the sleeper didn't work, the open for read would never // return. res := <-done - if res.error != syserror.ErrInterrupted { + if res.error != linuxerr.ErrInterrupted { t.Fatalf("Cancellation didn't cause GetFile to return fs.ErrInterrupted, got %v.", res.error) } @@ -258,7 +258,7 @@ func TestNonblockingWriteOpenFileNoReaders(t *testing.T) { ctx := newSleeperContext(t) f := NewInodeOperations(ctx, perms, newNamedPipe(t)) - if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); !linuxerr.Equals(linuxerr.ENXIO, err) { t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) } } diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 06769931a..86beee6fe 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -22,11 +22,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -200,7 +200,7 @@ func (p *Pipe) peekLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) if !p.HasWriters() { return 0, io.EOF } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } count = p.size } @@ -249,7 +249,7 @@ func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error) avail := p.max - p.size if avail == 0 { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } short := false if count > avail { @@ -257,7 +257,7 @@ func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error) // (PIPE_BUF) be atomic, but requires no atomicity for writes // larger than this. if count <= atomicIOBytes { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } count = avail short = true @@ -306,7 +306,7 @@ func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error) // If we shortened the write, adjust the returned error appropriately. if short { - return done, syserror.ErrWouldBlock + return done, linuxerr.ErrWouldBlock } return done, nil @@ -428,18 +428,18 @@ func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) { // SetFifoSize implements fs.FifoSizer.SetFifoSize. func (p *Pipe) SetFifoSize(size int64) (int64, error) { if size < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if size < MinimumPipeSize { size = MinimumPipeSize // Per spec. } if size > MaximumPipeSize { - return 0, syserror.EPERM + return 0, linuxerr.EPERM } p.mu.Lock() defer p.mu.Unlock() if size < p.size { - return 0, syserror.EBUSY + return 0, linuxerr.EBUSY } p.max = size return size, nil diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index 867f4a76b..aa3ab305d 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -18,8 +18,8 @@ import ( "bytes" "testing" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -51,8 +51,8 @@ func TestPipeReadBlock(t *testing.T) { defer w.DecRef(ctx) n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) - if n != 0 || err != syserror.ErrWouldBlock { - t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock) + if n != 0 || err != linuxerr.ErrWouldBlock { + t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, linuxerr.ErrWouldBlock) } } @@ -67,7 +67,7 @@ func TestPipeWriteBlock(t *testing.T) { msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) - if wantN, wantErr := int64(capacity), syserror.ErrWouldBlock; n != wantN || err != wantErr { + if wantN, wantErr := int64(capacity), linuxerr.ErrWouldBlock; n != wantN || err != wantErr { t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) } } @@ -102,7 +102,7 @@ func TestPipeWriteUntilEnd(t *testing.T) { for { n, err := r.Readv(ctx, dst) dst = dst.DropFirst64(n) - if err == syserror.ErrWouldBlock { + if err == linuxerr.ErrWouldBlock { select { case <-ch: continue @@ -129,7 +129,7 @@ func TestPipeWriteUntilEnd(t *testing.T) { for src.NumBytes() != 0 { n, err := w.Writev(ctx, src) src = src.DropFirst64(n) - if err == syserror.ErrWouldBlock { + if err == linuxerr.ErrWouldBlock { <-ch continue } diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go index dd60cba24..077c5d596 100644 --- a/pkg/sentry/kernel/pipe/pipe_unsafe.go +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -23,6 +23,8 @@ import ( // concurrent calls cannot deadlock. // // Preconditions: x != y. +// +checklocksacquire:x.mu +// +checklocksacquire:y.mu func lockTwoPipes(x, y *Pipe) { // Lock the two pipes in order of increasing address. if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 24e467e93..c883a9014 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -86,7 +87,7 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) if n > 0 { p.Notify(waiter.ReadableEvents) } - if err == unix.EPIPE { + if linuxerr.Equals(linuxerr.EPIPE, err) { // If we are returning EPIPE send SIGPIPE to the task. if sendSig := linux.SignalNoInfoFuncFromContext(ctx); sendSig != nil { sendSig(linux.SIGPIPE) @@ -135,7 +136,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume v = math.MaxInt32 // Silently truncate. } // Copy result to userspace. - iocc := primitive.IOCopyContext{ + iocc := usermem.IOCopyContext{ IO: io, Ctx: ctx, Opts: usermem.IOOpts{ @@ -156,6 +157,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume // // mu must be held by the caller. waitFor returns with mu held, but it will // drop mu before blocking for any reader/writers. +// +checklocks:mu func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { // Ideally this function would simply use a condition variable. However, the // wait needs to be interruptible via 'sleeper', so we must sychronize via a diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 95b948edb..a6f1989f5 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -17,12 +17,12 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -79,7 +79,7 @@ func (vp *VFSPipe) ReaderWriterPair(ctx context.Context, mnt *vfs.Mount, vfsd *v // Allocate implements vfs.FileDescriptionImpl.Allocate. func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error { - return syserror.ESPIPE + return linuxerr.ESPIPE } // Open opens the pipe represented by vp. @@ -90,7 +90,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s readable := vfs.MayReadFileWithOpenFlags(statusFlags) writable := vfs.MayWriteFileWithOpenFlags(statusFlags) if !readable && !writable { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fd, err := vp.newFD(mnt, vfsd, statusFlags, locks) @@ -120,7 +120,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // writer, we have to wait for a writer to open the other end. if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { fd.DecRef(ctx) - return nil, syserror.EINTR + return nil, linuxerr.EINTR } case writable: @@ -131,12 +131,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // side isn't open yet. if statusFlags&linux.O_NONBLOCK != 0 { fd.DecRef(ctx) - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { fd.DecRef(ctx) - return nil, syserror.EINTR + return nil, linuxerr.EINTR } } @@ -224,7 +224,7 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.ESPIPE + return linuxerr.ESPIPE } // EventRegister implements waiter.Waitable.EventRegister. @@ -415,7 +415,7 @@ func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { // Preconditions: count > 0. func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { if dst.pipe == src.pipe { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } lockTwoPipes(dst.pipe, src.pipe) diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index d801a3d83..319754a42 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -18,8 +18,8 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // IntervalTimer represents a POSIX interval timer as described by @@ -175,7 +175,7 @@ func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux. break } if t.tg.nextTimerID == end { - return 0, syserror.EAGAIN + return 0, linuxerr.EAGAIN } } @@ -214,16 +214,16 @@ func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux. target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] t.tg.pidns.owner.mu.RUnlock() if !ok || target.tg != t.tg { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } it.target = target default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if sigev.Notify != linux.SIGEV_NONE { it.signo = linux.Signal(sigev.Signo) if !it.signo.IsValid() { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } it.timer = ktime.NewTimer(c, it) @@ -238,7 +238,7 @@ func (t *Task) IntervalTimerDelete(id linux.TimerID) error { defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return syserror.EINVAL + return linuxerr.EINVAL } delete(t.tg.timers, id) it.DestroyTimer() @@ -251,7 +251,7 @@ func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return linux.Itimerspec{}, syserror.EINVAL + return linux.Itimerspec{}, linuxerr.EINVAL } newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) @@ -269,7 +269,7 @@ func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return linux.Itimerspec{}, syserror.EINVAL + return linux.Itimerspec{}, linuxerr.EINVAL } tm, s := it.timer.Get() @@ -285,7 +285,7 @@ func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // By timer_create(2) invariant, either it.target == nil (in which case // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index a6287fd6a..717c9a6b3 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -19,10 +19,10 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -294,7 +294,7 @@ func (t *Task) isYAMADescendantOfLocked(ancestor *Task) bool { // Precondition: the TaskSet mutex must be locked (for reading or writing). func (t *Task) hasYAMAExceptionForLocked(tracer *Task) bool { - allowed, ok := t.k.ptraceExceptions[t] + allowed, ok := t.k.ptraceExceptions[t.tg.leader] if !ok { return false } @@ -464,7 +464,7 @@ func (t *Task) ptraceUnfreezeLocked() { // stop. func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { - return syserror.EIO + return linuxerr.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() @@ -481,7 +481,7 @@ func (t *Task) ptraceTraceme() error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if t.hasTracer() { - return syserror.EPERM + return linuxerr.EPERM } if t.parent == nil { // In Linux, only init can not have a parent, and init is assumed never @@ -497,7 +497,7 @@ func (t *Task) ptraceTraceme() error { return nil } if !t.parent.canTraceLocked(t, true) { - return syserror.EPERM + return linuxerr.EPERM } if t.parent.exitState != TaskExitNone { // Fail silently, as if we were successfully attached but then @@ -513,25 +513,25 @@ func (t *Task) ptraceTraceme() error { // ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if t.tg == target.tg { - return syserror.EPERM + return linuxerr.EPERM } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if !t.canTraceLocked(target, true) { - return syserror.EPERM + return linuxerr.EPERM } if target.hasTracer() { - return syserror.EPERM + return linuxerr.EPERM } // Attaching to zombies and dead tasks is not permitted; the exit // notification logic relies on this. Linux allows attaching to PF_EXITING // tasks, though. if target.exitState >= TaskExitZombie { - return syserror.EPERM + return linuxerr.EPERM } if seize { if err := target.ptraceSetOptionsLocked(opts); err != nil { - return syserror.EIO + return linuxerr.EIO } } target.ptraceTracer.Store(t) @@ -568,7 +568,7 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { // ptrace stop. func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { - return syserror.EIO + return linuxerr.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() @@ -651,6 +651,7 @@ func (t *Task) forgetTracerLocked() { // Preconditions: // * The signal mutex must be locked. // * The caller must be running on the task goroutine. +// +checklocks:t.tg.signalHandlers.mu func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false @@ -766,14 +767,14 @@ const ( // ptraceClone is called at the end of a clone or fork syscall to check if t // should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK // stop. child is the new task. -func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, args *linux.CloneArgs) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() event := false - if !opts.Untraced { + if args.Flags&linux.CLONE_UNTRACED == 0 { switch kind { case ptraceCloneKindClone: if t.ptraceOpts.TraceClone { @@ -808,7 +809,7 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => // include/linux/ptrace.h:ptrace_init_task(). - if event || opts.InheritTracer { + if event || args.Flags&linux.CLONE_PTRACE != 0 { tracer := t.Tracer() if tracer != nil { child.ptraceTracer.Store(tracer) @@ -910,7 +911,7 @@ func (t *Task) ptraceExit() { return } t.tg.signalHandlers.mu.Lock() - status := t.exitStatus.Status() + status := t.exitStatus t.tg.signalHandlers.mu.Unlock() t.Debugf("Entering PTRACE_EVENT_EXIT stop") t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) @@ -938,7 +939,7 @@ func (t *Task) ptraceKill(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { - return syserror.ESRCH + return linuxerr.ESRCH } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() @@ -962,10 +963,10 @@ func (t *Task) ptraceInterrupt(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { - return syserror.ESRCH + return linuxerr.ESRCH } if !target.ptraceSeized { - return syserror.EIO + return linuxerr.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() @@ -994,7 +995,7 @@ func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { linux.PTRACE_O_TRACEVFORK | linux.PTRACE_O_TRACEVFORKDONE) if opts&^valid != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } t.ptraceOpts = ptraceOptions{ ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, @@ -1020,7 +1021,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // specified by pid. target := t.tg.pidns.TaskWithID(pid) if target == nil { - return syserror.ESRCH + return linuxerr.ESRCH } // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already @@ -1028,7 +1029,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { seize := req == linux.PTRACE_SEIZE if seize && addr != 0 { - return syserror.EIO + return linuxerr.EIO } return t.ptraceAttach(target, seize, uintptr(data)) } @@ -1045,7 +1046,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() if target.Tracer() != t { t.tg.pidns.owner.mu.RUnlock() - return syserror.ESRCH + return linuxerr.ESRCH } if !target.ptraceFreeze() { t.tg.pidns.owner.mu.RUnlock() @@ -1053,7 +1054,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - // ptrace(2) - return syserror.ESRCH + return linuxerr.ESRCH } t.tg.pidns.owner.mu.RUnlock() // Even if the target has a ptrace-stop active, the tracee's task goroutine @@ -1118,13 +1119,13 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !target.ptraceSeized { - return syserror.EIO + return linuxerr.EIO } if target.ptraceSiginfo == nil { - return syserror.EIO + return linuxerr.EIO } if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { - return syserror.EIO + return linuxerr.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() @@ -1221,7 +1222,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { - return syserror.EINVAL + return linuxerr.EINVAL } _, err := target.ptraceSiginfo.CopyOut(t, data) return err @@ -1234,14 +1235,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { - return syserror.EINVAL + return linuxerr.EINVAL } target.ptraceSiginfo = &info return nil case linux.PTRACE_GETSIGMASK: if addr != linux.SignalSetSize { - return syserror.EINVAL + return linuxerr.EINVAL } mask := target.SignalMask() _, err := mask.CopyOut(t, data) @@ -1249,7 +1250,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { case linux.PTRACE_SETSIGMASK: if addr != linux.SignalSetSize { - return syserror.EINVAL + return linuxerr.EINVAL } var mask linux.SignalSet if _, err := mask.CopyIn(t, data); err != nil { diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 5ae05b5c3..564add01b 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -87,6 +88,6 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) err return err default: - return syserror.EIO + return linuxerr.EIO } } diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 46dd84cbc..7c2b94339 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -12,16 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kernel import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/syserror" ) // ptraceArch implements arch-specific ptrace commands. func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { - return syserror.EIO + return linuxerr.EIO } diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 4bc5bca44..de352f4f2 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -18,9 +18,9 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -59,23 +59,23 @@ func (t *Task) RSeqAvailable() bool { func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr != 0 { if t.rseqAddr != addr { - return syserror.EINVAL + return linuxerr.EINVAL } if t.rseqSignature != signature { - return syserror.EINVAL + return linuxerr.EINVAL } - return syserror.EBUSY + return linuxerr.EBUSY } // rseq must be aligned and correctly sized. if addr&(linux.AlignOfRSeq-1) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if length != linux.SizeOfRSeq { - return syserror.EINVAL + return linuxerr.EINVAL } if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { - return syserror.EFAULT + return linuxerr.EFAULT } t.rseqAddr = addr @@ -92,7 +92,7 @@ func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) - return syserror.EFAULT + return linuxerr.EFAULT } return nil @@ -103,16 +103,16 @@ func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { // Preconditions: The caller must be running on the task goroutine. func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if t.rseqAddr != addr { - return syserror.EINVAL + return linuxerr.EINVAL } if length != linux.SizeOfRSeq { - return syserror.EINVAL + return linuxerr.EINVAL } if t.rseqSignature != signature { - return syserror.EPERM + return linuxerr.EPERM } if err := t.rseqClearCPU(); err != nil { @@ -152,10 +152,10 @@ func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { return nil } if r.CriticalSection.Start >= r.CriticalSection.End { - return syserror.EINVAL + return linuxerr.EINVAL } if r.CriticalSection.Contains(r.Restart) { - return syserror.EINVAL + return linuxerr.EINVAL } // TODO(jamieliu): check that r.CriticalSection and r.Restart are in // the application address range, for consistency with Linux. @@ -187,7 +187,7 @@ func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error { // unfortunate, but unlikely in a correct program. if err := t.rseqUpdateCPU(); err != nil { t.oldRSeqCPUAddr = 0 - return syserror.EINVAL // yes, EINVAL, not err or EFAULT + return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT } return nil } diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 54ca43c2e..0d66648c3 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -18,9 +18,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/syserror" ) const maxSyscallFilterInstructions = 1 << 15 @@ -176,7 +176,7 @@ func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { } if totalLength > maxSyscallFilterInstructions { - return syserror.ENOMEM + return linuxerr.ENOMEM } newFilters = append(newFilters, p) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 65e5427c1..6aa74219e 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -25,12 +25,12 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", - "//pkg/log", + "//pkg/errors/linuxerr", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/time", "//pkg/sync", - "//pkg/syserror", ], ) @@ -40,10 +40,11 @@ go_test( srcs = ["semaphore_test.go"], library = ":semaphore", deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/sentry/contexttest", - "//pkg/sentry/kernel/auth", - "//pkg/syserror", + "//pkg/abi/linux", # keep + "//pkg/context", # keep + "//pkg/errors/linuxerr", #keep + "//pkg/sentry/contexttest", # keep + "//pkg/sentry/kernel/auth", # keep + "//pkg/sentry/kernel/ipc", # keep ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 47bb66b42..28e466948 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -20,12 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -46,15 +46,15 @@ const ( // // +stateify savable type Registry struct { - // userNS owning the ipc name this registry belongs to. Immutable. - userNS *auth.UserNamespace // mu protects all fields below. - mu sync.Mutex `state:"nosave"` - semaphores map[int32]*Set - lastIDUsed int32 + mu sync.Mutex `state:"nosave"` + + // reg defines basic fields and operations needed for all SysV registries. + reg *ipc.Registry + // indexes maintains a mapping between a set's index in virtual array and // its identifier. - indexes map[int32]int32 + indexes map[int32]ipc.ID } // Set represents a set of semaphores that can be operated atomically. @@ -64,19 +64,11 @@ type Set struct { // registry owning this sem set. Immutable. registry *Registry - // Id is a handle that identifies the set. - ID int32 - - // key is an user provided key that can be shared between processes. - key int32 + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` - // creator is the user that created the set. Immutable. - creator fs.FileOwner + obj *ipc.Object - // mu protects all fields below. - mu sync.Mutex `state:"nosave"` - owner fs.FileOwner - perms fs.FilePermissions opTime ktime.Time changeTime ktime.Time @@ -114,9 +106,8 @@ type waiter struct { // NewRegistry creates a new semaphore set registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ - userNS: userNS, - semaphores: make(map[int32]*Set), - indexes: make(map[int32]int32), + reg: ipc.NewRegistry(userNS), + indexes: make(map[int32]ipc.ID), } } @@ -125,62 +116,48 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry { // a new set is always created. If create is false, it fails if a set cannot // be found. If exclusive is true, it fails if a set with the same key already // exists. -func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { +func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { if nsems < 0 || nsems > semsMax { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } r.mu.Lock() defer r.mu.Unlock() if !private { - // Look up an existing semaphore. - if set := r.findByKey(key); set != nil { - set.mu.Lock() - defer set.mu.Unlock() - - // Check that caller can access semaphore set. - creds := auth.CredentialsFromContext(ctx) - if !set.checkPerms(creds, fs.PermsFromMode(mode)) { - return nil, syserror.EACCES - } + set, err := r.reg.Find(ctx, key, mode, create, exclusive) + if err != nil { + return nil, err + } - // Validate parameters. + // Validate semaphore-specific parameters. + if set != nil { + set := set.(*Set) if nsems > int32(set.Size()) { - return nil, syserror.EINVAL - } - if create && exclusive { - return nil, syserror.EEXIST + return nil, linuxerr.EINVAL } return set, nil } - - if !create { - // Semaphore not found and should not be created. - return nil, syserror.ENOENT - } } // Zero is only valid if an existing set is found. if nsems == 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // Apply system limits. // - // Map semaphores and map indexes in a registry are of the same size, - // check map semaphores only here for the system limit. - if len(r.semaphores) >= setsMax { - return nil, syserror.ENOSPC + // Map reg.objects and map indexes in a registry are of the same size, + // check map reg.objects only here for the system limit. + if r.reg.ObjectCount() >= setsMax { + return nil, linuxerr.ENOSPC } if r.totalSems() > int(semsTotalMax-nsems) { - return nil, syserror.ENOSPC + return nil, linuxerr.ENOSPC } // Finally create a new set. - owner := fs.FileOwnerFromContext(ctx) - perms := fs.FilePermsFromMode(mode) - return r.newSet(ctx, key, owner, owner, perms, nsems) + return r.newSetLocked(ctx, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode), nsems) } // IPCInfo returns information about system-wide semaphore limits and parameters. @@ -207,7 +184,7 @@ func (r *Registry) SemInfo() *linux.SemInfo { defer r.mu.Unlock() info := r.IPCInfo() - info.SemUsz = uint32(len(r.semaphores)) + info.SemUsz = uint32(r.reg.ObjectCount()) info.SemAem = uint32(r.totalSems()) return info @@ -230,77 +207,59 @@ func (r *Registry) HighestIndex() int32 { return highestIndex } -// RemoveID removes set with give 'id' from the registry and marks the set as +// Remove removes set with give 'id' from the registry and marks the set as // dead. All waiters will be awakened and fail. -func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { +func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error { r.mu.Lock() defer r.mu.Unlock() - set := r.semaphores[id] - if set == nil { - return syserror.EINVAL - } index, found := r.findIndexByID(id) if !found { - // Inconsistent state. - panic(fmt.Sprintf("unable to find an index for ID: %d", id)) + return linuxerr.EINVAL } + delete(r.indexes, index) - set.mu.Lock() - defer set.mu.Unlock() - - // "The effective user ID of the calling process must match the creator or - // owner of the semaphore set, or the caller must be privileged." - if !set.checkCredentials(creds) && !set.checkCapability(creds) { - return syserror.EACCES - } + r.reg.Remove(id, creds) - delete(r.semaphores, set.ID) - delete(r.indexes, index) - set.destroy() return nil } -func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { +// newSetLocked creates a new Set using given fields. An error is returned if there +// are no more available identifiers. +// +// Precondition: r.mu must be held. +func (r *Registry) newSetLocked(ctx context.Context, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { set := &Set{ registry: r, - key: key, - owner: owner, - creator: owner, - perms: perms, + obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, perms), changeTime: ktime.NowFromContext(ctx), sems: make([]sem, nsems), } - // Find the next available ID. - for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { - // Handle wrap around. - if id < 0 { - id = 0 - continue - } - if r.semaphores[id] == nil { - index, found := r.findFirstAvailableIndex() - if !found { - panic("unable to find an available index") - } - r.indexes[index] = id - r.lastIDUsed = id - r.semaphores[id] = set - set.ID = id - return set, nil - } + err := r.reg.Register(set) + if err != nil { + return nil, err + } + + index, found := r.findFirstAvailableIndex() + if !found { + // See linux, ipc/sem.c:newary(). + return nil, linuxerr.ENOSPC } + r.indexes[index] = set.obj.ID - log.Warningf("Semaphore map is full, they must be leaking") - return nil, syserror.ENOMEM + return set, nil } // FindByID looks up a set given an ID. -func (r *Registry) FindByID(id int32) *Set { +func (r *Registry) FindByID(id ipc.ID) *Set { r.mu.Lock() defer r.mu.Unlock() - return r.semaphores[id] + mech := r.reg.FindByID(id) + if mech == nil { + return nil + } + return mech.(*Set) } // FindByIndex looks up a set given an index. @@ -312,19 +271,10 @@ func (r *Registry) FindByIndex(index int32) *Set { if !present { return nil } - return r.semaphores[id] + return r.reg.FindByID(id).(*Set) } -func (r *Registry) findByKey(key int32) *Set { - for _, v := range r.semaphores { - if v.key == key { - return v - } - } - return nil -} - -func (r *Registry) findIndexByID(id int32) (int32, bool) { +func (r *Registry) findIndexByID(id ipc.ID) (int32, bool) { for k, v := range r.indexes { if v == id { return k, true @@ -344,12 +294,36 @@ func (r *Registry) findFirstAvailableIndex() (int32, bool) { func (r *Registry) totalSems() int { totalSems := 0 - for _, v := range r.semaphores { - totalSems += v.Size() - } + r.reg.ForAllObjects( + func(o ipc.Mechanism) { + totalSems += o.(*Set).Size() + }, + ) return totalSems } +// ID returns semaphore's ID. +func (s *Set) ID() ipc.ID { + return s.obj.ID +} + +// Object implements ipc.Mechanism.Object. +func (s *Set) Object() *ipc.Object { + return s.obj +} + +// Lock implements ipc.Mechanism.Lock. +func (s *Set) Lock() { + s.mu.Lock() +} + +// Unlock implements ipc.mechanism.Unlock. +// +// +checklocksignore +func (s *Set) Unlock() { + s.mu.Unlock() +} + func (s *Set) findSem(num int32) *sem { if num < 0 || int(num) >= s.Size() { return nil @@ -362,19 +336,15 @@ func (s *Set) Size() int { return len(s.sems) } -// Change changes some fields from the set atomically. -func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error { +// Set modifies attributes for a semaphore set. See semctl(IPC_SET). +func (s *Set) Set(ctx context.Context, ds *linux.SemidDS) error { s.mu.Lock() defer s.mu.Unlock() - // "The effective UID of the calling process must match the owner or creator - // of the semaphore set, or the caller must be privileged." - if !s.checkCredentials(creds) && !s.checkCapability(creds) { - return syserror.EACCES + if err := s.obj.Set(ctx, &ds.SemPerm); err != nil { + return err } - s.owner = owner - s.perms = perms s.changeTime = ktime.NowFromContext(ctx) return nil } @@ -394,18 +364,18 @@ func (s *Set) semStat(creds *auth.Credentials, permMask fs.PermMask) (*linux.Sem s.mu.Lock() defer s.mu.Unlock() - if !s.checkPerms(creds, permMask) { - return nil, syserror.EACCES + if !s.obj.CheckPermissions(creds, permMask) { + return nil, linuxerr.EACCES } return &linux.SemidDS{ SemPerm: linux.IPCPerm{ - Key: uint32(s.key), - UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), - GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), - CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), - CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), - Mode: uint16(s.perms.LinuxMode()), + Key: uint32(s.obj.Key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Creator.GID)), + Mode: uint16(s.obj.Perms.LinuxMode()), Seq: 0, // IPC sequence not supported. }, SemOTime: s.opTime.TimeT(), @@ -417,20 +387,20 @@ func (s *Set) semStat(creds *auth.Credentials, permMask fs.PermMask) (*linux.Sem // SetVal overrides a semaphore value, waking up waiters as needed. func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { - return syserror.ERANGE + return linuxerr.ERANGE } s.mu.Lock() defer s.mu.Unlock() // "The calling process must have alter permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Write: true}) { - return syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Write: true}) { + return linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return syserror.ERANGE + return linuxerr.ERANGE } // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. @@ -452,7 +422,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for _, val := range vals { if val > valueMax { - return syserror.ERANGE + return linuxerr.ERANGE } } @@ -460,8 +430,8 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti defer s.mu.Unlock() // "The calling process must have alter permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Write: true}) { - return syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Write: true}) { + return linuxerr.EACCES } for i, val := range vals { @@ -482,13 +452,13 @@ func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return sem.value, nil } @@ -499,8 +469,8 @@ func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return nil, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return nil, linuxerr.EACCES } vals := make([]uint16, s.Size()) @@ -516,13 +486,13 @@ func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." - if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return sem.pid, nil } @@ -532,13 +502,13 @@ func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *wait defer s.mu.Unlock() // The calling process must have read permission on the semaphore set. - if !s.checkPerms(creds, fs.PermMask{Read: true}) { - return 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { + return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } var cnt uint16 for w := sem.waiters.Front(); w != nil; w = w.Next() { @@ -574,22 +544,22 @@ func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Cr // Did it race with a removal operation? if s.dead { - return nil, 0, syserror.EIDRM + return nil, 0, linuxerr.EIDRM } // Validate the operations. readOnly := true for _, op := range ops { if s.findSem(int32(op.SemNum)) == nil { - return nil, 0, syserror.EFBIG + return nil, 0, linuxerr.EFBIG } if op.SemOp != 0 { readOnly = false } } - if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { - return nil, 0, syserror.EACCES + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { + return nil, 0, linuxerr.EACCES } ch, num, err := s.executeOps(ctx, ops, pid) @@ -613,7 +583,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch if tmpVals[op.SemNum] != 0 { // Semaphore isn't 0, must wait. if op.SemFlg&linux.IPC_NOWAIT != 0 { - return nil, 0, syserror.ErrWouldBlock + return nil, 0, linuxerr.ErrWouldBlock } w := newWaiter(op.SemOp) @@ -624,12 +594,12 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch if op.SemOp < 0 { // Handle 'wait' operation. if -op.SemOp > valueMax { - return nil, 0, syserror.ERANGE + return nil, 0, linuxerr.ERANGE } if -op.SemOp > tmpVals[op.SemNum] { // Not enough resources, must wait. if op.SemFlg&linux.IPC_NOWAIT != 0 { - return nil, 0, syserror.ErrWouldBlock + return nil, 0, linuxerr.ErrWouldBlock } w := newWaiter(op.SemOp) @@ -639,7 +609,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } else { // op.SemOp > 0: Handle 'signal' operation. if tmpVals[op.SemNum] > valueMax-op.SemOp { - return nil, 0, syserror.ERANGE + return nil, 0, linuxerr.ERANGE } } @@ -674,38 +644,10 @@ func (s *Set) AbortWait(num int32, ch chan struct{}) { // Waiter may not be found in case it raced with wakeWaiters(). } -func (s *Set) checkCredentials(creds *auth.Credentials) bool { - return s.owner.UID == creds.EffectiveKUID || - s.owner.GID == creds.EffectiveKGID || - s.creator.UID == creds.EffectiveKUID || - s.creator.GID == creds.EffectiveKGID -} - -func (s *Set) checkCapability(creds *auth.Credentials) bool { - return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() -} - -func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { - // Are we owner, or in group, or other? - p := s.perms.Other - if s.owner.UID == creds.EffectiveKUID { - p = s.perms.User - } else if creds.InGroup(s.owner.GID) { - p = s.perms.Group - } - - // Are permissions satisfied without capability checks? - if p.SupersetOf(reqPerms) { - return true - } - - return s.checkCapability(creds) -} - -// destroy destroys the set. +// Destroy implements ipc.Mechanism.Destroy. // // Preconditions: Caller must hold 's.mu'. -func (s *Set) destroy() { +func (s *Set) Destroy() { // Notify all waiters. They will fail on the next attempt to execute // operations and return error. s.dead = true diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index e47acefdf..59ac92ef1 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -19,9 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ) func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { @@ -55,7 +56,7 @@ func signalled(ch chan struct{}) bool { func TestBasic(t *testing.T) { ctx := contexttest.Context(t) - set := &Set{ID: 123, sems: make([]sem, 1)} + set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)} ops := []linux.Sembuf{ {SemOp: 1}, } @@ -76,7 +77,7 @@ func TestBasic(t *testing.T) { func TestWaitForZero(t *testing.T) { ctx := contexttest.Context(t) - set := &Set{ID: 123, sems: make([]sem, 1)} + set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)} ops := []linux.Sembuf{ {SemOp: 0}, } @@ -115,7 +116,7 @@ func TestWaitForZero(t *testing.T) { func TestNoWait(t *testing.T) { ctx := contexttest.Context(t) - set := &Set{ID: 123, sems: make([]sem, 1)} + set := &Set{obj: &ipc.Object{ID: 123}, sems: make([]sem, 1)} ops := []linux.Sembuf{ {SemOp: 1}, } @@ -123,14 +124,14 @@ func TestNoWait(t *testing.T) { ops[0].SemOp = -2 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { - t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + if _, _, err := set.executeOps(ctx, ops, 123); err != linuxerr.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, linuxerr.ErrWouldBlock) } ops[0].SemOp = 0 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { - t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + if _, _, err := set.executeOps(ctx, ops, 123); err != linuxerr.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, linuxerr.ErrWouldBlock) } } @@ -138,11 +139,12 @@ func TestUnregister(t *testing.T) { ctx := contexttest.Context(t) r := NewRegistry(auth.NewRootUserNamespace()) set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) + if err != nil { t.Fatalf("FindOrCreate() failed, err: %v", err) } - if got := r.FindByID(set.ID); got.ID != set.ID { - t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set) + if got := r.FindByID(set.obj.ID); got.obj.ID != set.obj.ID { + t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.obj.ID, got, set) } ops := []linux.Sembuf{ @@ -155,14 +157,14 @@ func TestUnregister(t *testing.T) { } creds := auth.CredentialsFromContext(ctx) - if err := r.RemoveID(set.ID, creds); err != nil { - t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err) + if err := r.Remove(set.obj.ID, creds); err != nil { + t.Fatalf("Remove(%d) failed, err: %v", set.obj.ID, err) } if !set.dead { t.Fatalf("set is not dead: %+v", set) } - if got := r.FindByID(set.ID); got != nil { - t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got) + if got := r.FindByID(set.obj.ID); got != nil { + t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.obj.ID, got) } for i, ch := range chs { if !signalled(ch) { diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index ca9076406..f9f872522 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,7 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // SessionID is the public identifier. @@ -120,8 +120,9 @@ func (pg *ProcessGroup) Originator() *ThreadGroup { // IsOrphan returns true if this process group is an orphan. func (pg *ProcessGroup) IsOrphan() bool { - pg.originator.TaskSet().mu.RLock() - defer pg.originator.TaskSet().mu.RUnlock() + ts := pg.originator.TaskSet() + ts.mu.RLock() + defer ts.mu.RUnlock() return pg.ancestors == 0 } @@ -277,14 +278,14 @@ func (tg *ThreadGroup) createSession() error { continue } if s.leader == tg { - return syserror.EPERM + return linuxerr.EPERM } if s.id == SessionID(id) { - return syserror.EPERM + return linuxerr.EPERM } for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { if pg.id == ProcessGroupID(id) { - return syserror.EPERM + return linuxerr.EPERM } } } @@ -369,17 +370,22 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // Get the ID for this thread in the current namespace. id := tg.pidns.tgids[tg] + // Check whether a process still exists or not. + if id == 0 { + return linuxerr.ESRCH + } + // Per above, check for a Session leader or existing group. for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { if s.leader.pidns != tg.pidns { continue } if s.leader == tg { - return syserror.EPERM + return linuxerr.EPERM } for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { if pg.id == ProcessGroupID(id) { - return syserror.EPERM + return linuxerr.EPERM } } } @@ -437,17 +443,17 @@ func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID // Lookup the ProcessGroup. pg := pidns.processGroups[pgid] if pg == nil { - return syserror.EPERM + return linuxerr.EPERM } // Disallow the join if an execve has performed, per POSIX. if checkExec && tg.execed { - return syserror.EACCES + return linuxerr.EACCES } // See if it's in the same session as ours. if pg.session != tg.processGroup.session { - return syserror.EPERM + return linuxerr.EPERM } // Join the group; adjust children. diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 1c3c0794f..2547957ba 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -28,6 +28,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/refs", @@ -35,12 +36,12 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index a73f1bdca..ab938fa3c 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -38,24 +38,19 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) -// Key represents a shm segment key. Analogous to a file name. -type Key int32 - -// ID represents the opaque handle for a shm segment. Analogous to an fd. -type ID int32 - // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. @@ -68,50 +63,51 @@ type Registry struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` - // shms maps segment ids to segments. + // reg defines basic fields and operations needed for all SysV registries. + // + // Withing reg, there are two maps, Objects and KeysToIDs. // - // shms holds all referenced segments, which are removed on the last + // reg.objects holds all referenced segments, which are removed on the last // DecRef. Thus, it cannot itself hold a reference on the Shm. // // Since removal only occurs after the last (unlocked) DecRef, there // exists a short window during which a Shm still exists in Shm, but is // unreferenced. Users must use TryIncRef to determine if the Shm is // still valid. - shms map[ID]*Shm - - // keysToShms maps segment keys to segments. // - // Shms in keysToShms are guaranteed to be referenced, as they are + // keysToIDs maps segment keys to IDs. + // + // Shms in keysToIDs are guaranteed to be referenced, as they are // removed by disassociateKey before the last DecRef. - keysToShms map[Key]*Shm + reg *ipc.Registry // Sum of the sizes of all existing segments rounded up to page size, in // units of page size. totalPages uint64 - - // ID assigned to the last created segment. Used to quickly find the next - // unused ID. - lastIDUsed ID } // NewRegistry creates a new shm registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ - userNS: userNS, - shms: make(map[ID]*Shm), - keysToShms: make(map[Key]*Shm), + userNS: userNS, + reg: ipc.NewRegistry(userNS), } } // FindByID looks up a segment given an ID. // // FindByID returns a reference on Shm. -func (r *Registry) FindByID(id ID) *Shm { +func (r *Registry) FindByID(id ipc.ID) *Shm { r.mu.Lock() defer r.mu.Unlock() - s := r.shms[id] + mech := r.reg.FindByID(id) + if mech == nil { + return nil + } + s := mech.(*Shm) + // Take a reference on s. If TryIncRef fails, s has reached the last - // DecRef, but hasn't quite been removed from r.shms yet. + // DecRef, but hasn't quite been removed from r.reg.objects yet. if s != nil && s.TryIncRef() { return s } @@ -128,9 +124,9 @@ func (r *Registry) dissociateKey(s *Shm) { defer r.mu.Unlock() s.mu.Lock() defer s.mu.Unlock() - if s.key != linux.IPC_PRIVATE { - delete(r.keysToShms, s.key) - s.key = linux.IPC_PRIVATE + if s.obj.Key != linux.IPC_PRIVATE { + r.reg.DissociateKey(s.obj.Key) + s.obj.Key = linux.IPC_PRIVATE } } @@ -138,82 +134,60 @@ func (r *Registry) dissociateKey(s *Shm) { // analogous to open(2). // // FindOrCreate returns a reference on Shm. -func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { +func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." - man shmget(2) // // Note that 'private' always implies the creation of a new segment // whether IPC_CREAT is specified or not. - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } r.mu.Lock() defer r.mu.Unlock() - if len(r.shms) >= linux.SHMMNI { + if r.reg.ObjectCount() >= linux.SHMMNI { // "All possible shared memory IDs have been taken (SHMMNI) ..." // - man shmget(2) - return nil, syserror.ENOSPC + return nil, linuxerr.ENOSPC } if !private { - // Look up an existing segment. - if shm := r.keysToShms[key]; shm != nil { - shm.mu.Lock() - defer shm.mu.Unlock() - - // Check that caller can access the segment. - if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { - // "The user does not have permission to access the shared - // memory segment, and does not have the CAP_IPC_OWNER - // capability in the user namespace that governs its IPC - // namespace." - man shmget(2) - return nil, syserror.EACCES - } + shm, err := r.reg.Find(ctx, key, mode, create, exclusive) + if err != nil { + return nil, err + } + // Validate shm-specific parameters. + if shm != nil { + shm := shm.(*Shm) if size > shm.size { // "A segment for the given key exists, but size is greater than // the size of that segment." - man shmget(2) - return nil, syserror.EINVAL - } - - if create && exclusive { - // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a - // shared memory segment already exists for key." - // - man shmget(2) - return nil, syserror.EEXIST + return nil, linuxerr.EINVAL } - shm.IncRef() return shm, nil } - - if !create { - // "No segment exists for the given key, and IPC_CREAT was not - // specified." - man shmget(2) - return nil, syserror.ENOENT - } } var sizeAligned uint64 if val, ok := hostarch.Addr(size).RoundUp(); ok { sizeAligned = uint64(val) } else { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL { // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) - return nil, syserror.ENOSPC + return nil, linuxerr.ENOSPC } // Need to create a new segment. - creator := fs.FileOwnerFromContext(ctx) - perms := fs.FilePermsFromMode(mode) - s, err := r.newShm(ctx, pid, key, creator, perms, size) + s, err := r.newShmLocked(ctx, pid, key, fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode), size) if err != nil { return nil, err } @@ -223,10 +197,10 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui return s, nil } -// newShm creates a new segment in the registry. +// newShmLocked creates a new segment in the registry. // // Precondition: Caller must hold r.mu. -func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { +func (r *Registry) newShmLocked(ctx context.Context, pid int32, key ipc.Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) if mfp == nil { panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) @@ -241,40 +215,21 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi shm := &Shm{ mfp: mfp, registry: r, - creator: creator, size: size, effectiveSize: effectiveSize, + obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, perms), fr: fr, - key: key, - perms: perms, - owner: creator, creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } shm.InitRefs() - // Find the next available ID. - for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { - // Handle wrap around. - if id < 0 { - id = 0 - continue - } - if r.shms[id] == nil { - r.lastIDUsed = id - - shm.ID = id - r.shms[id] = shm - r.keysToShms[key] = shm - - r.totalPages += effectiveSize / hostarch.PageSize - - return shm, nil - } + if err := r.reg.Register(shm); err != nil { + return nil, err } + r.totalPages += effectiveSize / hostarch.PageSize - log.Warningf("Shm ids exhuasted, they may be leaking") - return nil, syserror.ENOSPC + return shm, nil } // IPCInfo reports global parameters for sysv shared memory segments on this @@ -296,7 +251,7 @@ func (r *Registry) ShmInfo() *linux.ShmInfo { defer r.mu.Unlock() return &linux.ShmInfo{ - UsedIDs: int32(r.lastIDUsed), + UsedIDs: int32(r.reg.LastIDUsed()), ShmTot: r.totalPages, ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. ShmSwp: 0, // No reclaim at the moment. @@ -313,11 +268,11 @@ func (r *Registry) remove(s *Shm) { s.mu.Lock() defer s.mu.Unlock() - if s.key != linux.IPC_PRIVATE { + if s.obj.Key != linux.IPC_PRIVATE { panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) } - delete(r.shms, s.ID) + r.reg.DissociateID(s.obj.ID) r.totalPages -= s.effectiveSize / hostarch.PageSize } @@ -329,13 +284,16 @@ func (r *Registry) Release(ctx context.Context) { // the IPC namespace containing it has no more references. toRelease := make([]*Shm, 0) r.mu.Lock() - for _, s := range r.keysToShms { - s.mu.Lock() - if !s.pendingDestruction { - toRelease = append(toRelease, s) - } - s.mu.Unlock() - } + r.reg.ForAllObjects( + func(o ipc.Mechanism) { + s := o.(*Shm) + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + }, + ) r.mu.Unlock() for _, s := range toRelease { @@ -373,12 +331,6 @@ type Shm struct { // registry points to the shm registry containing this segment. Immutable. registry *Registry - // ID is the kernel identifier for this segment. Immutable. - ID ID - - // creator is the user that created the segment. Immutable. - creator fs.FileOwner - // size is the requested size of the segment at creation, in // bytes. Immutable. size uint64 @@ -396,14 +348,8 @@ type Shm struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` - // key is the public identifier for this segment. - key Key - - // perms is the access permissions for the segment. - perms fs.FilePermissions + obj *ipc.Object - // owner of this segment. - owner fs.FileOwner // attachTime is updated on every successful shmat. attachTime ktime.Time // detachTime is updated on every successful shmdt. @@ -425,17 +371,44 @@ type Shm struct { pendingDestruction bool } +// ID returns object's ID. +func (s *Shm) ID() ipc.ID { + return s.obj.ID +} + +// Object implements ipc.Mechanism.Object. +func (s *Shm) Object() *ipc.Object { + return s.obj +} + +// Destroy implements ipc.Mechanism.Destroy. No work is performed on shm.Destroy +// because a different removal mechanism is used in shm. See Shm.MarkDestroyed. +func (s *Shm) Destroy() { +} + +// Lock implements ipc.Mechanism.Lock. +func (s *Shm) Lock() { + s.mu.Lock() +} + +// Unlock implements ipc.mechanism.Unlock. +// +// +checklocksignore +func (s *Shm) Unlock() { + s.mu.Unlock() +} + // Precondition: Caller must hold s.mu. func (s *Shm) debugLocked() string { return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", - s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) + s.obj.ID, s.obj.Key, s.size, s.ReadRefs(), s.pendingDestruction) } // MappedName implements memmap.MappingIdentity.MappedName. func (s *Shm) MappedName(ctx context.Context) string { s.mu.Lock() defer s.mu.Unlock() - return fmt.Sprintf("SYSV%08d", s.key) + return fmt.Sprintf("SYSV%08d", s.obj.Key) } // DeviceID implements memmap.MappingIdentity.DeviceID. @@ -447,7 +420,7 @@ func (s *Shm) DeviceID() uint64 { func (s *Shm) InodeID() uint64 { // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() - return uint64(s.ID) + return uint64(s.obj.ID) } // DecRef drops a reference on s. @@ -511,7 +484,7 @@ func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > s.fr.Length() { - err = &memmap.BusError{syserror.EFAULT} + err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ @@ -547,10 +520,11 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts Atta s.mu.Lock() defer s.mu.Unlock() if s.pendingDestruction && s.ReadRefs() == 0 { - return memmap.MMapOpts{}, syserror.EIDRM + return memmap.MMapOpts{}, linuxerr.EIDRM } - if !s.checkPermissions(ctx, fs.PermMask{ + creds := auth.CredentialsFromContext(ctx) + if !s.obj.CheckPermissions(creds, fs.PermMask{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, @@ -558,7 +532,7 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts Atta // "The calling process does not have the required permissions for the // requested attach type, and does not have the CAP_IPC_OWNER capability // in the user namespace that governs its IPC namespace." - man shmat(2) - return memmap.MMapOpts{}, syserror.EACCES + return memmap.MMapOpts{}, linuxerr.EACCES } return memmap.MMapOpts{ Length: s.size, @@ -590,19 +564,19 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { // "The caller must have read permission on the shared memory segment." // - man shmctl(2) - if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { + creds := auth.CredentialsFromContext(ctx) + if !s.obj.CheckPermissions(creds, fs.PermMask{Read: true}) { // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow // read access for shmid, and the calling process does not have the // CAP_IPC_OWNER capability in the user namespace that governs its IPC // namespace." - man shmctl(2) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } var mode uint16 if s.pendingDestruction { mode |= linux.SHM_DEST } - creds := auth.CredentialsFromContext(ctx) // Use the reference count as a rudimentary count of the number of // attaches. We exclude: @@ -619,12 +593,12 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { ds := &linux.ShmidDS{ ShmPerm: linux.IPCPerm{ - Key: uint32(s.key), - UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), - GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), - CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), - CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), - Mode: mode | uint16(s.perms.LinuxMode()), + Key: uint32(s.obj.Key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.Creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.Creator.GID)), + Mode: mode | uint16(s.obj.Perms.LinuxMode()), Seq: 0, // IPC sequences not supported. }, ShmSegsz: s.size, @@ -644,25 +618,10 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { s.mu.Lock() defer s.mu.Unlock() - if !s.checkOwnership(ctx) { - return syserror.EPERM - } - - creds := auth.CredentialsFromContext(ctx) - uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) - gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) - if !uid.Ok() || !gid.Ok() { - return syserror.EINVAL + if err := s.obj.Set(ctx, &ds.ShmPerm); err != nil { + return err } - // User may only modify the lower 9 bits of the mode. All the other bits are - // always 0 for the underlying inode. - mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) - s.perms = fs.FilePermsFromMode(mode) - - s.owner.UID = uid - s.owner.GID = gid - s.changeTime = ktime.NowFromContext(ctx) return nil } @@ -690,40 +649,3 @@ func (s *Shm) MarkDestroyed(ctx context.Context) { s.DecRef(ctx) return } - -// checkOwnership verifies whether a segment may be accessed by ctx as an -// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. -// -// Precondition: Caller must hold s.mu. -func (s *Shm) checkOwnership(ctx context.Context) bool { - creds := auth.CredentialsFromContext(ctx) - if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { - return true - } - - // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux - // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented - // for use to "override IPC ownership checks". - return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) -} - -// checkPermissions verifies whether a segment is accessible by ctx for access -// described by req. See ipc/util.c:ipcperms() in Linux. -// -// Precondition: Caller must hold s.mu. -func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { - creds := auth.CredentialsFromContext(ctx) - - p := s.perms.Other - if s.owner.UID == creds.EffectiveKUID { - p = s.perms.User - } else if creds.InGroup(s.owner.GID) { - p = s.perms.Group - } - if p.SupersetOf(req) { - return true - } - - // Tasks with CAP_IPC_OWNER may bypass permission checks. - return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) -} diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 76d472292..4180ca28e 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -9,12 +9,12 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index f58ec4194..9c5e6698c 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -18,12 +18,12 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -64,7 +64,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { t := kernel.TaskFromContext(ctx) if t == nil { // No task context? Not valid. - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } // name matches fs/signalfd.c:signalfd4. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]") @@ -98,7 +98,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS info, err := s.target.Sigtimedwait(s.Mask(), 0) if err != nil { // There must be no signal available. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Copy out the signal info using the specified format. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index b21a6ad57..9a95bf44c 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -29,10 +30,10 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -232,7 +233,7 @@ type Task struct { // exitStatus is the task's exit status. // // exitStatus is protected by the signal mutex. - exitStatus ExitStatus + exitStatus linux.WaitStatus // syscallRestartBlock represents a custom restart function to run in // restart_syscall(2) to resume an interrupted syscall. @@ -846,21 +847,19 @@ func (t *Task) OOMScoreAdj() int32 { // value should be between -1000 and 1000 inclusive. func (t *Task) SetOOMScoreAdj(adj int32) error { if adj > 1000 || adj < -1000 { - return syserror.EINVAL + return linuxerr.EINVAL } atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil } -// UID returns t's uid. -// TODO(gvisor.dev/issue/170): This method is not namespaced yet. -func (t *Task) UID() uint32 { +// KUID returns t's kuid. +func (t *Task) KUID() uint32 { return uint32(t.Credentials().EffectiveKUID) } -// GID returns t's gid. -// TODO(gvisor.dev/issue/170): This method is not namespaced yet. -func (t *Task) GID() uint32 { +// KGID returns t's kgid. +func (t *Task) KGID() uint32 { return uint32(t.Credentials().EffectiveKGID) } @@ -876,3 +875,23 @@ func (t *Task) ResetKcov() { t.kcov = nil } } + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) loadSeccheckInfoLocked(req seccheck.TaskFieldSet, mask *seccheck.TaskFieldSet, info *seccheck.TaskInfo) { + if req.Contains(seccheck.TaskFieldThreadID) { + info.ThreadID = int32(t.k.tasks.Root.tids[t]) + mask.Add(seccheck.TaskFieldThreadID) + } + if req.Contains(seccheck.TaskFieldThreadStartTime) { + info.ThreadStartTime = t.startTime + mask.Add(seccheck.TaskFieldThreadStartTime) + } + if req.Contains(seccheck.TaskFieldThreadGroupID) { + info.ThreadGroupID = int32(t.k.tasks.Root.tgids[t.tg]) + mask.Add(seccheck.TaskFieldThreadGroupID) + } + if req.Contains(seccheck.TaskFieldThreadGroupStartTime) { + info.ThreadGroupStartTime = t.tg.leader.startTime + mask.Add(seccheck.TaskFieldThreadGroupStartTime) + } +} diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index e574997f7..dd364ae50 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -18,10 +18,10 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // Getitimer implements getitimer(2). @@ -44,7 +44,7 @@ func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { s, _ = t.tg.itimerProfSetting.At(tm) t.tg.signalHandlers.mu.Unlock() default: - return linux.ItimerVal{}, syserror.EINVAL + return linux.ItimerVal{}, linuxerr.EINVAL } val, iv := ktime.SpecFromSetting(tm, s) return linux.ItimerVal{ @@ -105,7 +105,7 @@ func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, err return linux.ItimerVal{}, err } default: - return linux.ItimerVal{}, syserror.EINVAL + return linux.ItimerVal{}, linuxerr.EINVAL } oldval, oldiv := ktime.SpecFromSetting(tm, olds) return linux.ItimerVal{ diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index ecbe8f920..9bfc155e4 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -19,9 +19,9 @@ import ( "runtime/trace" "time" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // BlockWithTimeout blocks t until an event is received from C, the application @@ -32,7 +32,7 @@ import ( // and is unspecified if haveTimeout is false. // // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout -// expired, and syserror.ErrInterrupted if t is interrupted. +// expired, and linuxerr.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { @@ -45,7 +45,7 @@ func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time. err := t.BlockWithDeadline(C, true, deadline) // Timeout, explicitly return a remaining duration of 0. - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, err } @@ -66,7 +66,7 @@ func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time. // application monotonic clock indicates a time of deadline (only if // haveDeadline is true), or t is interrupted. It returns nil if an event is // received from C, ETIMEDOUT if the deadline expired, and -// syserror.ErrInterrupted if t is interrupted. +// linuxerr.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline ktime.Time) error { @@ -94,7 +94,7 @@ func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline // BlockWithTimer blocks t until an event is received from C or tchan, or t is // interrupted. It returns nil if an event is received from C, ETIMEDOUT if an -// event is received from tchan, and syserror.ErrInterrupted if t is +// event is received from tchan, and linuxerr.ErrInterrupted if t is // interrupted. // // Most clients should use BlockWithDeadline or BlockWithTimeout instead. @@ -105,7 +105,7 @@ func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error { } // Block blocks t until an event is received from C or t is interrupted. It -// returns nil if an event is received from C and syserror.ErrInterrupted if t +// returns nil if an event is received from C and linuxerr.ErrInterrupted if t // is interrupted. // // Preconditions: The caller must be running on the task goroutine. @@ -156,13 +156,13 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { region.End() t.SleepFinish(false) // Return the indicated error on interrupt. - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted case <-timerChan: region.End() t.SleepFinish(true) // We've timed out. - return syserror.ETIMEDOUT + return linuxerr.ETIMEDOUT } } diff --git a/pkg/sentry/kernel/task_cgroup.go b/pkg/sentry/kernel/task_cgroup.go index 7c138e80f..828b90014 100644 --- a/pkg/sentry/kernel/task_cgroup.go +++ b/pkg/sentry/kernel/task_cgroup.go @@ -20,15 +20,13 @@ import ( "sort" "strings" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/syserror" ) // EnterInitialCgroups moves t into an initial set of cgroups. // // Precondition: t isn't in any cgroups yet, t.cgs is empty. -// -// +checklocksignore parent.mu is conditionally acquired. func (t *Task) EnterInitialCgroups(parent *Task) { var inherit map[Cgroup]struct{} if parent != nil { @@ -67,7 +65,7 @@ func (t *Task) EnterCgroup(c Cgroup) error { // // TODO(b/183137098): Implement cgroup migration. log.Warningf("Cgroup migration is not implemented") - return syserror.EBUSY + return linuxerr.EBUSY } } } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 405771f3f..26a981f36 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -20,147 +20,47 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/usermem" ) -// SharingOptions controls what resources are shared by a new task created by -// Task.Clone, or an existing task affected by Task.Unshare. -type SharingOptions struct { - // If NewAddressSpace is true, the task should have an independent virtual - // address space. - NewAddressSpace bool - - // If NewSignalHandlers is true, the task should use an independent set of - // signal handlers. - NewSignalHandlers bool - - // If NewThreadGroup is true, the task should be the leader of its own - // thread group. TerminationSignal is the signal that the thread group - // will send to its parent when it exits. If NewThreadGroup is false, - // TerminationSignal is ignored. - NewThreadGroup bool - TerminationSignal linux.Signal - - // If NewPIDNamespace is true: - // - // - In the context of Task.Clone, the new task should be the init task - // (TID 1) in a new PID namespace. - // - // - In the context of Task.Unshare, the task should create a new PID - // namespace, and all subsequent clones of the task should be members of - // the new PID namespace. - NewPIDNamespace bool - - // If NewUserNamespace is true, the task should have an independent user - // namespace. - NewUserNamespace bool - - // If NewNetworkNamespace is true, the task should have an independent - // network namespace. - NewNetworkNamespace bool - - // If NewFiles is true, the task should use an independent file descriptor - // table. - NewFiles bool - - // If NewFSContext is true, the task should have an independent FSContext. - NewFSContext bool - - // If NewUTSNamespace is true, the task should have an independent UTS - // namespace. - NewUTSNamespace bool - - // If NewIPCNamespace is true, the task should have an independent IPC - // namespace. - NewIPCNamespace bool -} - -// CloneOptions controls the behavior of Task.Clone. -type CloneOptions struct { - // SharingOptions defines the set of resources that the new task will share - // with its parent. - SharingOptions - - // Stack is the initial stack pointer of the new task. If Stack is 0, the - // new task will start with the same stack pointer as its parent. - Stack hostarch.Addr - - // If SetTLS is true, set the new task's TLS (thread-local storage) - // descriptor to TLS. If SetTLS is false, TLS is ignored. - SetTLS bool - TLS hostarch.Addr - - // If ChildClearTID is true, when the child exits, 0 is written to the - // address ChildTID in the child's memory, and if the write is successful a - // futex wake on the same address is performed. - // - // If ChildSetTID is true, the child's thread ID (in the child's PID - // namespace) is written to address ChildTID in the child's memory. (As in - // Linux, failed writes are silently ignored.) - ChildClearTID bool - ChildSetTID bool - ChildTID hostarch.Addr - - // If ParentSetTID is true, the child's thread ID (in the parent's PID - // namespace) is written to address ParentTID in the parent's memory. (As - // in Linux, failed writes are silently ignored.) - // - // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID - // causes the child's thread ID to be written to ptid in both the parent - // and child's memory, but this is a documentation error fixed by - // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). - ParentSetTID bool - ParentTID hostarch.Addr - - // If Vfork is true, place the parent in vforkStop until the cloned task - // releases its TaskImage. - Vfork bool - - // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for - // this clone(), and do not ptrace-attach the caller's tracer to the new - // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). - Untraced bool - - // If InheritTracer is true, ptrace-attach the caller's tracer to the new - // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported - // for it. If both Untraced and InheritTracer are true, no event will be - // reported, but tracer inheritance will still occur. - InheritTracer bool -} - // Clone implements the clone(2) syscall and returns the thread ID of the new // task in t's PID namespace. Clone may return both a non-zero thread ID and a // non-nil error. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. -func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { +func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { // Since signal actions may refer to application signal handlers by virtual // address, any set of signal handlers must refer to the same address // space. - if !opts.NewSignalHandlers && opts.NewAddressSpace { - return 0, nil, syserror.EINVAL + if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND { + return 0, nil, linuxerr.EINVAL } // In order for the behavior of thread-group-directed signals to be sane, // all tasks in a thread group must share signal handlers. - if !opts.NewThreadGroup && opts.NewSignalHandlers { - return 0, nil, syserror.EINVAL + if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD { + return 0, nil, linuxerr.EINVAL } // All tasks in a thread group must be in the same PID namespace. - if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { - return 0, nil, syserror.EINVAL + if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) { + return 0, nil, linuxerr.EINVAL } // The two different ways of specifying a new PID namespace are // incompatible. - if opts.NewPIDNamespace && t.childPIDNamespace != nil { - return 0, nil, syserror.EINVAL + if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil { + return 0, nil, linuxerr.EINVAL } // Thread groups and FS contexts cannot span user namespaces. - if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { - return 0, nil, syserror.EINVAL + if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 { + return 0, nil, linuxerr.EINVAL + } + // args.ExitSignal must be a valid signal. + if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() { + return 0, nil, linuxerr.EINVAL } // Pull task registers and FPU state, a cloned task will inherit the @@ -174,7 +74,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // user_namespaces(7) creds := t.Credentials() userns := creds.UserNamespace - if opts.NewUserNamespace { + if args.Flags&linux.CLONE_NEWUSER != 0 { var err error // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and // the caller is in a chroot environment (i.e., the caller's root @@ -182,28 +82,26 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // in which it resides)." - clone(2). Neither chroot(2) nor // user_namespaces(7) document this. if t.IsChrooted() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } userns, err = creds.NewChildUserNamespace() if err != nil { return 0, nil, err } } - if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { - return 0, nil, syserror.EPERM + if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { + return 0, nil, linuxerr.EPERM } utsns := t.UTSNamespace() - if opts.NewUTSNamespace { + if args.Flags&linux.CLONE_NEWUTS != 0 { // Note that this must happen after NewUserNamespace so we get // the new userns if there is one. utsns = t.UTSNamespace().Clone(userns) } ipcns := t.IPCNamespace() - if opts.NewIPCNamespace { - // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC - // namespace" + if args.Flags&linux.CLONE_NEWIPC != 0 { ipcns = NewIPCNamespace(userns) } else { ipcns.IncRef() @@ -214,7 +112,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { defer cu.Clean() netns := t.NetworkNamespace() - if opts.NewNetworkNamespace { + if args.Flags&linux.CLONE_NEWNET != 0 { netns = inet.NewNamespace(netns) } @@ -227,7 +125,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) } - image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) + image, err := t.image.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0) if err != nil { return 0, nil, err } @@ -236,17 +134,17 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { }) // clone() returns 0 in the child. image.Arch.SetReturn(0) - if opts.Stack != 0 { - image.Arch.SetStack(uintptr(opts.Stack)) + if args.Stack != 0 { + image.Arch.SetStack(uintptr(args.Stack)) } - if opts.SetTLS { - if !image.Arch.SetTLS(uintptr(opts.TLS)) { - return 0, nil, syserror.EPERM + if args.Flags&linux.CLONE_SETTLS != 0 { + if !image.Arch.SetTLS(uintptr(args.TLS)) { + return 0, nil, linuxerr.EPERM } } var fsContext *FSContext - if opts.NewFSContext { + if args.Flags&linux.CLONE_FS == 0 { fsContext = t.fsContext.Fork() } else { fsContext = t.fsContext @@ -254,7 +152,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } var fdTable *FDTable - if opts.NewFiles { + if args.Flags&linux.CLONE_FILES == 0 { fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable @@ -264,22 +162,22 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { pidns := t.tg.pidns if t.childPIDNamespace != nil { pidns = t.childPIDNamespace - } else if opts.NewPIDNamespace { + } else if args.Flags&linux.CLONE_NEWPID != 0 { pidns = pidns.NewChild(userns) } tg := t.tg rseqAddr := hostarch.Addr(0) rseqSignature := uint32(0) - if opts.NewThreadGroup { + if args.Flags&linux.CLONE_THREAD == 0 { if tg.mounts != nil { tg.mounts.IncRef() } sh := t.tg.signalHandlers - if opts.NewSignalHandlers { + if args.Flags&linux.CLONE_SIGHAND == 0 { sh = sh.Fork() } - tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy()) tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature @@ -304,7 +202,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), } - if opts.NewThreadGroup { + if args.Flags&linux.CLONE_THREAD == 0 { cfg.Parent = t } else { cfg.InheritParent = t @@ -322,7 +220,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // // However kernel/fork.c:copy_process() adds a limitation to this: // "sigaltstack should be cleared when sharing the same VM". - if opts.NewAddressSpace || opts.Vfork { + if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 { nt.SetSignalStack(t.SignalStack()) } @@ -338,7 +236,23 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) - t.traceCloneEvent(tid) + + if seccheck.Global.Enabled(seccheck.PointClone) { + mask, info := getCloneSeccheckInfo(t, nt, args) + if err := seccheck.Global.Clone(t, mask, &info); err != nil { + // nt has been visible to the rest of the system since NewTask, so + // it may be blocking execve or a group stop, have been notified + // for group signal delivery, had children reparented to it, etc. + // Thus we can't just drop it on the floor. Instead, instruct the + // task goroutine to exit immediately, as quietly as possible. + nt.exitTracerNotified = true + nt.exitTracerAcked = true + nt.exitParentNotified = true + nt.exitParentAcked = true + nt.runState = (*runExitMain)(nil) + return 0, nil, err + } + } // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - @@ -347,41 +261,58 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) nt.syscallFilters.Store(copiedFilters) } - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { nt.vforkParent = t } - if opts.ChildClearTID { - nt.SetClearTID(opts.ChildTID) + if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 { + nt.SetClearTID(hostarch.Addr(args.ChildTID)) } - if opts.ChildSetTID { + if args.Flags&linux.CLONE_CHILD_SETTID != 0 { ctid := nt.ThreadID() - ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) + ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID)) } ntid := t.tg.pidns.IDOfTask(nt) - if opts.ParentSetTID { - ntid.CopyOut(t, opts.ParentTID) + if args.Flags&linux.CLONE_PARENT_SETTID != 0 { + ntid.CopyOut(t, hostarch.Addr(args.ParentTID)) } + t.traceCloneEvent(tid) kind := ptraceCloneKindClone - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { kind = ptraceCloneKindVfork - } else if opts.TerminationSignal == linux.SIGCHLD { + } else if linux.Signal(args.ExitSignal) == linux.SIGCHLD { kind = ptraceCloneKindFork } - if t.ptraceClone(kind, nt, opts) { - if opts.Vfork { + if t.ptraceClone(kind, nt, args) { + if args.Flags&linux.CLONE_VFORK != 0 { return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil } return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil } - if opts.Vfork { + if args.Flags&linux.CLONE_VFORK != 0 { t.maybeBeginVforkStop(nt) return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil } return ntid, nil, nil } +func getCloneSeccheckInfo(t, nt *Task, args *linux.CloneArgs) (seccheck.CloneFieldSet, seccheck.CloneInfo) { + req := seccheck.Global.CloneReq() + info := seccheck.CloneInfo{ + Credentials: t.Credentials(), + Args: *args, + } + var mask seccheck.CloneFieldSet + mask.Add(seccheck.CloneFieldCredentials) + mask.Add(seccheck.CloneFieldArgs) + t.k.tasks.mu.RLock() + defer t.k.tasks.mu.RUnlock() + t.loadSeccheckInfoLocked(req.Invoker, &mask.Invoker, &info.Invoker) + nt.loadSeccheckInfoLocked(req.Created, &mask.Created, &info.Created) + return mask, info +} + // maybeBeginVforkStop checks if a previously-started vfork child is still // running and has not yet released its MM, such that its parent t should enter // a vforkStop. @@ -446,39 +377,47 @@ func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { } // Unshare changes the set of resources t shares with other tasks, as specified -// by opts. +// by flags. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) Unshare(opts *SharingOptions) error { - // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and - // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if - // t is the only task using its MM, which due to clone(2)'s rules imply - // that it is also the only task using its signal handlers / in its thread - // group, and cause EINVAL to be returned otherwise. +func (t *Task) Unshare(flags int32) error { + // "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if + // the caller is single threaded (i.e., it is not sharing its address space + // with another process or thread). In this case, these flags have no + // effect. (Note also that specifying CLONE_THREAD automatically implies + // CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.) + // If the process is multithreaded, then the use of these flags results in + // an error." - unshare(2). This is incorrect (cf. + // kernel/fork.c:ksys_unshare()): + // + // - CLONE_THREAD does not imply CLONE_VM. + // + // - CLONE_SIGHAND implies CLONE_THREAD. + // + // - Only CLONE_VM requires that the caller is not sharing its address + // space with another thread. CLONE_SIGHAND requires that the caller is not + // sharing its signal handlers, and CLONE_THREAD requires that the caller + // is the only thread in its thread group. // // Since we don't count the number of tasks using each address space or set - // of signal handlers, we reject NewSignalHandlers and NewAddressSpace - // altogether, and interpret NewThreadGroup as requiring that t be the only - // member of its thread group. This seems to be logically coherent, in the - // sense that clone(2) allows a task to share signal handlers and address - // spaces with tasks in other thread groups. - if opts.NewAddressSpace || opts.NewSignalHandlers { - return syserror.EINVAL + // of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether. + if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 { + return linuxerr.EINVAL } creds := t.Credentials() - if opts.NewThreadGroup { + if flags&linux.CLONE_THREAD != 0 { t.tg.signalHandlers.mu.Lock() if t.tg.tasksCount != 1 { t.tg.signalHandlers.mu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } t.tg.signalHandlers.mu.Unlock() // This isn't racy because we're the only living task, and therefore // the only task capable of creating new ones, in our thread group. } - if opts.NewUserNamespace { + if flags&linux.CLONE_NEWUSER != 0 { if t.IsChrooted() { - return syserror.EPERM + return linuxerr.EPERM } newUserNS, err := creds.NewChildUserNamespace() if err != nil { @@ -492,34 +431,34 @@ func (t *Task) Unshare(opts *SharingOptions) error { creds = t.Credentials() } haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) - if opts.NewPIDNamespace { + if flags&linux.CLONE_NEWPID != 0 { if !haveCapSysAdmin { - return syserror.EPERM + return linuxerr.EPERM } t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) } t.mu.Lock() // Can't defer unlock: DecRefs must occur without holding t.mu. - if opts.NewNetworkNamespace { + if flags&linux.CLONE_NEWNET != 0 { if !haveCapSysAdmin { t.mu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } t.netns = inet.NewNamespace(t.netns) } - if opts.NewUTSNamespace { + if flags&linux.CLONE_NEWUTS != 0 { if !haveCapSysAdmin { t.mu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } // Note that this must happen after NewUserNamespace, so the // new user namespace is used if there is one. t.utsns = t.utsns.Clone(creds.UserNamespace) } - if opts.NewIPCNamespace { + if flags&linux.CLONE_NEWIPC != 0 { if !haveCapSysAdmin { t.mu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" @@ -527,12 +466,12 @@ func (t *Task) Unshare(opts *SharingOptions) error { t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable - if opts.NewFiles { + if flags&linux.CLONE_FILES != 0 { oldFDTable = t.fdTable t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext - if opts.NewFSContext { + if flags&linux.CLONE_FS != 0 { oldFSContext = t.fsContext t.fsContext = oldFSContext.Fork() } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index cf8571262..db91fc4d8 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -66,10 +66,10 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // execStop is a TaskStop that a task sets on itself when it wants to execve @@ -97,7 +97,7 @@ func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) { // We lost to a racing group-exit, kill, or exec from another thread // and should just exit. newImage.release() - return nil, syserror.EINTR + return nil, linuxerr.EINTR } // Cancel any racing group stops. @@ -222,9 +222,15 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // Update credentials to reflect the execve. This should precede switching // MMs to ensure that dumpability has been reset first, if needed. t.updateCredsForExecLocked() - t.image.release() + oldImage := t.image t.image = *r.image t.mu.Unlock() + + // Don't hold t.mu while calling t.image.release(), that may + // attempt to acquire TaskImage.MemoryManager.mappingMu, a lock order + // violation. + oldImage.release() + t.unstopVforkParent() t.p.FullStateChanged() // NOTE(b/30316266): All locks must be dropped prior to calling Activate. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index d115b8783..b3931445b 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -28,66 +28,14 @@ import ( "errors" "fmt" "strconv" - "strings" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) -// An ExitStatus is a value communicated from an exiting task or thread group -// to the party that reaps it. -// -// +stateify savable -type ExitStatus struct { - // Code is the numeric value passed to the call to exit or exit_group that - // caused the exit. If the exit was not caused by such a call, Code is 0. - Code int - - // Signo is the signal that caused the exit. If the exit was not caused by - // a signal, Signo is 0. - Signo int -} - -func (es ExitStatus) String() string { - var b strings.Builder - if code := es.Code; code != 0 { - if b.Len() != 0 { - b.WriteByte(' ') - } - _, _ = fmt.Fprintf(&b, "Code=%d", code) - } - if signal := es.Signo; signal != 0 { - if b.Len() != 0 { - b.WriteByte(' ') - } - _, _ = fmt.Fprintf(&b, "Signal=%d", signal) - } - return b.String() -} - -// Signaled returns true if the ExitStatus indicates that the exiting task or -// thread group was killed by a signal. -func (es ExitStatus) Signaled() bool { - return es.Signo != 0 -} - -// Status returns the numeric representation of the ExitStatus returned by e.g. -// the wait4() system call. -func (es ExitStatus) Status() uint32 { - return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) -} - -// ShellExitCode returns the numeric exit code that Bash would return for an -// exit status of es. -func (es ExitStatus) ShellExitCode() int { - if es.Signaled() { - return 128 + es.Signo - } - return es.Code -} - // TaskExitState represents a step in the task exit path. // // "Exiting" and "exited" are often ambiguous; prefer to name specific states. @@ -163,13 +111,13 @@ func (t *Task) killedLocked() bool { return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 } -// PrepareExit indicates an exit with status es. +// PrepareExit indicates an exit with the given status. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) PrepareExit(es ExitStatus) { +func (t *Task) PrepareExit(ws linux.WaitStatus) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() - t.exitStatus = es + t.exitStatus = ws } // PrepareGroupExit indicates a group exit with status es to t's thread group. @@ -180,7 +128,7 @@ func (t *Task) PrepareExit(es ExitStatus) { // ptrace.) // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) PrepareGroupExit(es ExitStatus) { +func (t *Task) PrepareGroupExit(ws linux.WaitStatus) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.tg.exiting || t.tg.execing != nil { @@ -198,8 +146,8 @@ func (t *Task) PrepareGroupExit(es ExitStatus) { return } t.tg.exiting = true - t.tg.exitStatus = es - t.exitStatus = es + t.tg.exitStatus = ws + t.exitStatus = ws for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { if sibling != t { sibling.killLocked() @@ -207,11 +155,11 @@ func (t *Task) PrepareGroupExit(es ExitStatus) { } } -// Kill requests that all tasks in ts exit as if group exiting with status es. +// Kill requests that all tasks in ts exit as if group exiting with status ws. // Kill does not wait for tasks to exit. // // Kill has no analogue in Linux; it's provided for save/restore only. -func (ts *TaskSet) Kill(es ExitStatus) { +func (ts *TaskSet) Kill(ws linux.WaitStatus) { ts.mu.Lock() defer ts.mu.Unlock() ts.Root.exiting = true @@ -219,7 +167,7 @@ func (ts *TaskSet) Kill(es ExitStatus) { t.tg.signalHandlers.mu.Lock() if !t.tg.exiting { t.tg.exiting = true - t.tg.exitStatus = es + t.tg.exitStatus = ws } t.killLocked() t.tg.signalHandlers.mu.Unlock() @@ -282,9 +230,16 @@ func (*runExitMain) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Lock() t.updateRSSLocked() t.tg.pidns.owner.mu.Unlock() + + // Release the task image resources. Accessing these fields must be + // done with t.mu held, but the mm.DecUsers() call must be done outside + // of that lock. t.mu.Lock() - t.image.release() + mm := t.image.MemoryManager + t.image.MemoryManager = nil + t.image.fu = nil t.mu.Unlock() + mm.DecUsers(t) // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() @@ -730,10 +685,10 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.S info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) if t.exitStatus.Signaled() { info.Code = linux.CLD_KILLED - info.SetStatus(int32(t.exitStatus.Signo)) + info.SetStatus(int32(t.exitStatus.TerminationSignal())) } else { info.Code = linux.CLD_EXITED - info.SetStatus(int32(t.exitStatus.Code)) + info.SetStatus(int32(t.exitStatus.ExitStatus())) } // TODO(b/72102453): Set utime, stime. return info @@ -741,7 +696,7 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.S // ExitStatus returns t's exit status, which is only guaranteed to be // meaningful if t.ExitState() != TaskExitNone. -func (t *Task) ExitStatus() ExitStatus { +func (t *Task) ExitStatus() linux.WaitStatus { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() @@ -751,7 +706,7 @@ func (t *Task) ExitStatus() ExitStatus { // ExitStatus returns the exit status that would be returned by a consuming // wait*() on tg. -func (tg *ThreadGroup) ExitStatus() ExitStatus { +func (tg *ThreadGroup) ExitStatus() linux.WaitStatus { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() @@ -762,7 +717,9 @@ func (tg *ThreadGroup) ExitStatus() ExitStatus { return tg.leader.exitStatus } -// TerminationSignal returns the thread group's termination signal. +// TerminationSignal returns the thread group's termination signal, which is +// the signal that will be sent to its leader's parent when all threads have +// exited. func (tg *ThreadGroup) TerminationSignal() linux.Signal { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() @@ -888,8 +845,8 @@ type WaitResult struct { // Event is exactly one of the events defined above. Event waiter.EventMask - // Status is the numeric status associated with the event. - Status uint32 + // Status is the wait status associated with the event. + Status linux.WaitStatus } // Wait waits for an event from a thread group that is a child of t's thread @@ -909,7 +866,7 @@ func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { return wr, err } if err := t.Block(ch); err != nil { - return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) + return wr, syserr.ConvertIntr(err, opts.BlockInterruptErr) } } } @@ -942,7 +899,7 @@ func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { if anyWaitableTasks { return nil, ErrNoWaitableEvent } - return nil, syserror.ECHILD + return nil, linuxerr.ECHILD } // Preconditions: The TaskSet mutex must be locked for writing. @@ -1042,7 +999,7 @@ func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtrace } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() - status := target.exitStatus.Status() + status := target.exitStatus if !opts.ConsumeEvent { return &WaitResult{ Task: target, @@ -1056,7 +1013,7 @@ func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtrace // differ from that reported by a consuming wait; the latter will return // the group exit code if one is available. if target.tg.exiting { - status = target.tg.exitStatus.Status() + status = target.tg.exitStatus } // t may be (in the thread group of) target's parent, tracer, or both. We // don't need to check for !exitTracerAcked because tracees are detached @@ -1122,12 +1079,11 @@ func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) target.tg.groupStopWaitable = false } return &WaitResult{ - Task: target, - TID: pid, - UID: uid, - Event: EventChildGroupStop, - // There is no name for these status constants. - Status: (uint32(sig)&0xff)<<8 | 0x7f, + Task: target, + TID: pid, + UID: uid, + Event: EventChildGroupStop, + Status: linux.WaitStatusStopped(uint32(sig)), } } @@ -1148,7 +1104,7 @@ func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) * TID: pid, UID: uid, Event: EventGroupContinue, - Status: 0xffff, + Status: linux.WaitStatusContinued(), } } @@ -1176,7 +1132,7 @@ func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *Wai TID: pid, UID: uid, Event: EventTraceeStop, - Status: uint32(code)<<8 | 0x7f, + Status: linux.WaitStatusStopped(uint32(code)), } } diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 0325967e4..a9067b682 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -16,9 +16,9 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" ) // Credentials returns t's credentials. @@ -47,7 +47,7 @@ func (t *Task) HasCapability(cp linux.Capability) bool { func (t *Task) SetUID(uid auth.UID) error { // setuid considers -1 to be invalid. if !uid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } t.mu.Lock() @@ -56,7 +56,7 @@ func (t *Task) SetUID(uid auth.UID) error { creds := t.Credentials() kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } // "setuid() sets the effective user ID of the calling process. If the // effective UID of the caller is root (more precisely: if the caller has @@ -70,7 +70,7 @@ func (t *Task) SetUID(uid auth.UID) error { // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." if kuid != creds.RealKUID && kuid != creds.SavedKUID { - return syserror.EPERM + return linuxerr.EPERM } t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil @@ -87,26 +87,26 @@ func (t *Task) SetREUID(r, e auth.UID) error { if r.Ok() { newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } newE := creds.EffectiveKUID if e.Ok() { newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { - return syserror.EPERM + return linuxerr.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." if newR != creds.RealKUID && newR != creds.EffectiveKUID { - return syserror.EPERM + return linuxerr.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user @@ -223,7 +223,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // SetGID implements the semantics of setgid(2). func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } t.mu.Lock() @@ -232,14 +232,14 @@ func (t *Task) SetGID(gid auth.GID) error { creds := t.Credentials() kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } if kgid != creds.RealKGID && kgid != creds.SavedKGID { - return syserror.EPERM + return linuxerr.EPERM } t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil @@ -255,22 +255,22 @@ func (t *Task) SetREGID(r, e auth.GID) error { if r.Ok() { newR = creds.UserNamespace.MapToKGID(r) if !newR.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } newE := creds.EffectiveKGID if e.Ok() { newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETGID) { if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { - return syserror.EPERM + return linuxerr.EPERM } if newR != creds.RealKGID && newR != creds.EffectiveKGID { - return syserror.EPERM + return linuxerr.EPERM } } newS := creds.SavedKGID @@ -343,13 +343,13 @@ func (t *Task) SetExtraGIDs(gids []auth.GID) error { defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETGID) { - return syserror.EPERM + return linuxerr.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } kgids[i] = kgid } @@ -367,25 +367,25 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili // "Permitted: This is a limiting superset for the effective capabilities // that the thread may assume." - capabilities(7) if effective & ^permitted != 0 { - return syserror.EPERM + return linuxerr.EPERM } creds := t.Credentials() // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { - return syserror.EPERM + return linuxerr.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." if permitted & ^creds.PermittedCaps != 0 { - return syserror.EPERM + return linuxerr.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { - return syserror.EPERM + return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.PermittedCaps = permitted @@ -402,7 +402,7 @@ func (t *Task) DropBoundingCapability(cp linux.Capability) error { defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETPCAP) { - return syserror.EPERM + return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.BoundingCaps &^= auth.CapabilitySetOf(cp) @@ -422,7 +422,7 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { - return syserror.EPERM + return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go index bd5543d4e..6002ffb42 100644 --- a/pkg/sentry/kernel/task_image.go +++ b/pkg/sentry/kernel/task_image.go @@ -17,7 +17,7 @@ package kernel import ( "fmt" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/syserr" ) -var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) +var errNoSyscalls = syserr.New("no syscall table found", errno.ENOEXEC) // Auxmap contains miscellaneous data for the task. type Auxmap map[string]interface{} @@ -53,7 +53,7 @@ type TaskImage struct { } // release releases all resources held by the TaskImage. release is called by -// the task when it execs into a new TaskImage or exits. +// the task when it execs into a new TaskImage. func (image *TaskImage) release() { // Nil out pointers so that if the task is saved after release, it doesn't // follow the pointers to possibly now-invalid objects. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 72b9a0384..c5b099559 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -235,7 +235,7 @@ func (t *Task) traceExitEvent() { if !trace.IsEnabled() { return } - trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) + trace.Logf(t.traceContext, traceCategory, "exit status: %s", t.exitStatus) } // traceExecEvent is called when a task calls exec. @@ -249,5 +249,9 @@ func (t *Task) traceExecEvent(image *TaskImage) { return } defer file.DecRef(t) - trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) + + // traceExecEvent function may be called before the task goroutine + // starts, so we must use the async context. + name := file.PathnameWithDeleted(t.AsyncContext()) + trace.Logf(t.traceContext, traceCategory, "exec: %s", name) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 068f25af1..7b336a46b 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -22,6 +22,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -29,7 +30,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/syserror" ) // A taskRunState is a reified state in the task state machine. See README.md @@ -197,8 +197,8 @@ func (app *runApp) execute(t *Task) taskRunState { // a pending signal, causing another interruption, but that signal should // not interact with the interrupted syscall.) if t.haveSyscallReturn { - if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { - if sre == syserror.ERESTART_RESTARTBLOCK { + if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok { + if sre == linuxerr.ERESTART_RESTARTBLOCK { t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) t.Arch().RestartSyscallWithRestartBlock() } else { @@ -377,7 +377,7 @@ func (app *runApp) execute(t *Task) taskRunState { default: // What happened? Can't continue. t.Warningf("Unexpected SwitchToApp error: %v", err) - t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)}) + t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1)))) return (*runExit)(nil) } } diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index f142feab4..9d9fa76a6 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -23,12 +23,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // TaskGoroutineState is a coarse representation of the current execution @@ -601,7 +601,7 @@ func (t *Task) SetCPUMask(mask sched.CPUSet) error { // Ensure that at least 1 CPU is still allowed. if mask.NumCPUs() == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if t.k.useHostCores { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 8ca61ed48..eeb3c5e69 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -22,12 +22,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -156,10 +156,11 @@ func (t *Task) PendingSignals() linux.SignalSet { // deliverSignal delivers the given signal and returns the following run state. func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRunState { - sigact := computeAction(linux.Signal(info.Signo), act) + sig := linux.Signal(info.Signo) + sigact := computeAction(sig, act) if t.haveSyscallReturn { - if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok { // Signals that are ignored, cause a thread group stop, or // terminate the thread group do not interact with interrupted // syscalls; in Linux terms, they are never returned to the signal @@ -168,13 +169,13 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu // signal that is actually handled (by userspace). if sigact == SignalActionHandler { switch { - case sre == syserror.ERESTARTNOHAND: + case sre == linuxerr.ERESTARTNOHAND: fallthrough - case sre == syserror.ERESTART_RESTARTBLOCK: + case sre == linuxerr.ERESTART_RESTARTBLOCK: fallthrough - case (sre == syserror.ERESTARTSYS && act.Flags&linux.SA_RESTART == 0): + case (sre == linuxerr.ERESTARTSYS && act.Flags&linux.SA_RESTART == 0): t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) - t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) + t.Arch().SetReturn(uintptr(-ExtractErrno(linuxerr.EINTR, -1))) default: t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().RestartSyscall() @@ -197,14 +198,14 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu } // Attach an fault address if appropriate. - switch linux.Signal(info.Signo) { + switch sig { case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS: ucs.FaultAddr = info.Addr() } eventchannel.Emit(ucs) - t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + t.PrepareGroupExit(linux.WaitStatusTerminationSignal(sig)) return (*runExit)(nil) case SignalActionStop: @@ -224,12 +225,12 @@ func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRu // Send a forced SIGSEGV. If the signal that couldn't be delivered // was a SIGSEGV, force the handler to SIG_DFL. - t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.forceSignal(linux.SIGSEGV, sig == linux.SIGSEGV /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) } default: - panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(sig, act))) } return (*runInterrupt)(nil) } @@ -338,7 +339,7 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux. } if timeout == 0 { - return nil, syserror.EAGAIN + return nil, linuxerr.EAGAIN } // Unblock signals we're waiting for. Remember the original signal mask so @@ -359,8 +360,8 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux. if info := t.dequeueSignalLocked(mask); info != nil { return info, nil } - if err == syserror.ETIMEDOUT { - return nil, syserror.EAGAIN + if err == linuxerr.ETIMEDOUT { + return nil, linuxerr.EAGAIN } return nil, err } @@ -369,9 +370,9 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux. // // The following errors may be returned: // -// syserror.ESRCH - The task has exited. -// syserror.EINVAL - The signal is not valid. -// syserror.EAGAIN - THe signal is realtime, and cannot be queued. +// linuxerr.ESRCH - The task has exited. +// linuxerr.EINVAL - The signal is not valid. +// linuxerr.EAGAIN - THe signal is realtime, and cannot be queued. // func (t *Task) SendSignal(info *linux.SignalInfo) error { t.tg.pidns.owner.mu.RLock() @@ -406,14 +407,14 @@ func (t *Task) sendSignalLocked(info *linux.SignalInfo, group bool) error { func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer *IntervalTimer) error { if t.exitState == TaskExitDead { - return syserror.ESRCH + return linuxerr.ESRCH } sig := linux.Signal(info.Signo) if sig == 0 { return nil } if !sig.IsValid() { - return syserror.EINVAL + return linuxerr.EINVAL } // Signal side effects apply even if the signal is ultimately discarded. @@ -450,7 +451,7 @@ func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer * } if !q.enqueue(info, timer) { if sig.IsRealtime() { - return syserror.EAGAIN + return linuxerr.EAGAIN } t.Debugf("Discarding duplicate signal %d", sig) if timer != nil { @@ -505,7 +506,7 @@ func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { // ignores tg.execing. if !tg.exiting { tg.exiting = true - tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + tg.exitStatus = linux.WaitStatusTerminationSignal(linux.SIGKILL) } for t := tg.tasks.Front(); t != nil; t = t.Next() { t.killLocked() @@ -684,7 +685,7 @@ func (t *Task) SetSignalStack(alt linux.SignalStack) bool { // to *actptr (if actptr is not nil) and returns the old signal action. func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) (linux.SigAction, error) { if !sig.IsValid() { - return linux.SigAction{}, syserror.EINVAL + return linux.SigAction{}, linuxerr.EINVAL } tg.pidns.owner.mu.RLock() @@ -695,7 +696,7 @@ func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) ( oldact := sh.actions[sig] if actptr != nil { if sig == linux.SIGKILL || sig == linux.SIGSTOP { - return oldact, syserror.EINVAL + return oldact, linuxerr.EINVAL } act := *actptr diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 41fd2d471..217c6f531 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // TaskConfig defines the configuration of a new Task (see below). @@ -169,7 +169,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { // doesn't matter too much since the caller will exit before it returns // to userspace. If the caller isn't in the same thread group, then // we're in uncharted territory and can return whatever we want. - return nil, syserror.EINTR + return nil, linuxerr.EINTR } if err := ts.assignTIDsLocked(t); err != nil { return nil, err @@ -267,7 +267,7 @@ func (ns *PIDNamespace) allocateTID() (ThreadID, error) { // fail with the error ENOMEM; it is not possible to create a new // processes [sic] in a PID namespace whose init process has // terminated." - pid_namespaces(7) - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } tid := ns.last for { @@ -299,7 +299,7 @@ func (ns *PIDNamespace) allocateTID() (ThreadID, error) { // Did we do a full cycle? if tid == ns.last { // No tid available. - return 0, syserror.EAGAIN + return 0, linuxerr.EAGAIN } } } diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 601fc0d3a..2b1d7e114 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -22,12 +22,13 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" ) // SyscallRestartBlock represents the restart block for a syscall restartable @@ -159,7 +160,7 @@ func (t *Task) doSyscall() taskRunState { // ok case linux.SECCOMP_RET_KILL_THREAD: t.Debugf("Syscall %d: killed by seccomp", sysno) - t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) case linux.SECCOMP_RET_TRACE: t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) @@ -309,7 +310,7 @@ func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState { return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} case linux.SECCOMP_RET_KILL_THREAD: t.Debugf("vsyscall %d: killed by seccomp", sysno) - t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) default: panic(fmt.Sprintf("Unknown seccomp result %d", r)) @@ -336,7 +337,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip // causes do_exit(SIGSYS), and changing sp is ignored. if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr { - t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) } if sysno == ^uintptr(0) { @@ -357,7 +358,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle t.Arch().SetReturn(uintptr(rval)) } else { t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) - if err == syserror.EFAULT { + if linuxerr.Equals(linuxerr.EFAULT, err) { t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // A return is not emulated in this case. @@ -379,8 +380,8 @@ func ExtractErrno(err error, sysno int) int { return 0 case unix.Errno: return int(err) - case syserror.SyscallRestartErrno: - return int(err) + case *errors.Error: + return int(err.Errno()) case *memmap.BusError: // Bus errors may generate SIGBUS, but for syscalls they still // return EFAULT. See case in task_run.go where the fault is @@ -393,8 +394,8 @@ func ExtractErrno(err error, sysno int) int { case *os.SyscallError: return ExtractErrno(err.Err, sysno) default: - if errno, ok := syserror.TranslateError(err); ok { - return int(errno) + if errno, ok := linuxerr.TranslateError(err); ok { + return int(errno.Errno()) } } panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index fc6d9438a..bff226a11 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -104,7 +104,7 @@ func (t *Task) CopyInVector(addr hostarch.Addr, maxElemSize, maxTotalSize int) ( // Each string has a zero terminating byte counted, so copying out a string // requires at least one byte of space. Also, see the calculation below. if maxTotalSize <= 0 { - return nil, syserror.ENOMEM + return nil, linuxerr.ENOMEM } thisMax := maxElemSize if maxTotalSize < thisMax { @@ -132,7 +132,7 @@ func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) erro case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { - return syserror.EFAULT + return linuxerr.EFAULT } b := t.CopyScratchBuffer(itemLen) @@ -147,7 +147,7 @@ func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) erro } default: - return syserror.ENOSYS + return linuxerr.ENOSYS } return nil @@ -190,7 +190,7 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { - return hostarch.AddrRangeSeq{}, syserror.EFAULT + return hostarch.AddrRangeSeq{}, linuxerr.EFAULT } b := t.CopyScratchBuffer(itemLen) @@ -202,11 +202,11 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8])) length := hostarch.ByteOrder.Uint64(b[8:16]) if length > math.MaxInt64 { - return hostarch.AddrRangeSeq{}, syserror.EINVAL + return hostarch.AddrRangeSeq{}, linuxerr.EINVAL } ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) if !ok { - return hostarch.AddrRangeSeq{}, syserror.EFAULT + return hostarch.AddrRangeSeq{}, linuxerr.EFAULT } if numIovecs == 1 { @@ -219,7 +219,7 @@ func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRan } default: - return hostarch.AddrRangeSeq{}, syserror.ENOSYS + return hostarch.AddrRangeSeq{}, linuxerr.ENOSYS } // Truncate to MAX_RW_COUNT. @@ -252,7 +252,7 @@ func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOO } ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) if !ok { - return usermem.IOSequence{}, syserror.EFAULT + return usermem.IOSequence{}, linuxerr.EFAULT } return usermem.IOSequence{ IO: t.MemoryManager(), @@ -270,7 +270,7 @@ func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOO // Preconditions: Same as Task.CopyInIovecs. func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } ars, err := t.CopyInIovecs(addr, iovcnt) if err != nil { @@ -312,7 +312,7 @@ func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) { tmm := cc.t.MemoryManager() cc.t.mu.Unlock() if !tmm.IncUsers() { - return nil, syserror.EFAULT + return nil, linuxerr.EFAULT } return tmm, nil } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 891e2201d..5814a4eca 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,13 +19,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // A ThreadGroup is a logical grouping of tasks that has widespread @@ -143,7 +143,7 @@ type ThreadGroup struct { // // While exiting is false, exitStatus is protected by the signal mutex. // When exiting becomes true, exitStatus becomes immutable. - exitStatus ExitStatus + exitStatus linux.WaitStatus // terminationSignal is the signal that this thread group's leader will // send to its parent when it exits. @@ -278,7 +278,7 @@ func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, s limits: limits, mounts: mntns, } - tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) + tg.itimerRealTimer = ktime.NewTimer(k.timekeeper.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) return tg @@ -357,7 +357,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) // "The calling process must be a session leader and not have a // controlling terminal already." - tty_ioctl(4) if tg.processGroup.session.leader != tg || tg.tty != nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(tg.leader) @@ -371,7 +371,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { // Stealing requires CAP_SYS_ADMIN in the root user namespace. if !hasAdmin || !steal { - return syserror.EPERM + return linuxerr.EPERM } // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. for othertg := range tg.pidns.owner.Root.tgids { @@ -391,7 +391,7 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) } if !isReadable && !hasAdmin { - return syserror.EPERM + return linuxerr.EPERM } // Set the controlling terminal and foreground process group. @@ -419,7 +419,7 @@ func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { if tg.tty == nil || tg.tty != tty { tg.signalHandlers.mu.Unlock() - return syserror.ENOTTY + return linuxerr.ENOTTY } // "If the process was session leader, then send SIGHUP and SIGCONT to @@ -473,7 +473,7 @@ func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) { // "When fd does not refer to the controlling terminal of the calling // process, -1 is returned" - tcgetpgrp(3) if tg.tty != tty { - return -1, syserror.ENOTTY + return -1, linuxerr.ENOTTY } return int32(tg.processGroup.session.foreground.id), nil @@ -489,31 +489,36 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() - // TODO(gvisor.dev/issue/6148): "If tcsetpgrp() is called by a member of a - // background process group in its session, and the calling process is not - // blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all members of - // this background process group." - // tty must be the controlling terminal. if tg.tty != tty { - return -1, syserror.ENOTTY + return -1, linuxerr.ENOTTY } // pgid must be positive. if pgid < 0 { - return -1, syserror.EINVAL + return -1, linuxerr.EINVAL } // pg must not be empty. Empty process groups are removed from their // pid namespaces. pg, ok := tg.pidns.processGroups[pgid] if !ok { - return -1, syserror.ESRCH + return -1, linuxerr.ESRCH } // pg must be part of this process's session. if tg.processGroup.session != pg.session { - return -1, syserror.EPERM + return -1, linuxerr.EPERM + } + + signalAction := tg.signalHandlers.actions[linux.SIGTTOU] + // If the calling process is a member of a background group, a SIGTTOU + // signal is sent to all members of this background process group. + // We need also need to check whether it is ignoring or blocking SIGTTOU. + ignored := signalAction.Handler == linux.SIG_IGN + blocked := tg.leader.signalMask == linux.SignalSetOf(linux.SIGTTOU) + if tg.processGroup.id != tg.processGroup.session.foreground.id && !ignored && !blocked { + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGTTOU), true) } tg.processGroup.session.foreground.id = pgid diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 2817aa3ba..e293d9a0f 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -13,8 +13,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sync", - "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 26aa34aa6..191b92811 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -22,8 +22,8 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -322,7 +322,7 @@ func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Sett // interpreted as a time relative to now. func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { if value < 0 { - return Setting{}, syserror.EINVAL + return Setting{}, linuxerr.EINVAL } if value == 0 { return Setting{Period: interval}, nil @@ -338,7 +338,7 @@ func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (S // interpreted as an absolute time. func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { if value.Before(ZeroTime) { - return Setting{}, syserror.EINVAL + return Setting{}, linuxerr.EINVAL } if value.IsZero() { return Setting{Period: interval}, nil diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 7c4fefb16..6255bae7a 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" ) // Timekeeper manages all of the kernel clocks. @@ -39,6 +40,12 @@ type Timekeeper struct { // It is set only once, by SetClocks. clocks sentrytime.Clocks `state:"nosave"` + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + // bootTime is the realtime when the system "booted". i.e., when // SetClocks was called in the initial (not restored) run. bootTime ktime.Time @@ -90,10 +97,13 @@ type Timekeeper struct { // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. -func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) { - return &Timekeeper{ +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) *Timekeeper { + t := Timekeeper{ params: NewVDSOParamPage(mfp, paramPage), - }, nil + } + t.realtimeClock = &timekeeperClock{tk: &t, c: sentrytime.Realtime} + t.monotonicClock = &timekeeperClock{tk: &t, c: sentrytime.Monotonic} + return &t } // SetClocks the backing clock source. @@ -167,6 +177,32 @@ func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { } } +var _ tcpip.Clock = (*Timekeeper)(nil) + +// Now implements tcpip.Clock. +func (t *Timekeeper) Now() time.Time { + nsec, err := t.GetTime(sentrytime.Realtime) + if err != nil { + panic("timekeeper.GetTime(sentrytime.Realtime): " + err.Error()) + } + return time.Unix(0, nsec) +} + +// NowMonotonic implements tcpip.Clock. +func (t *Timekeeper) NowMonotonic() tcpip.MonotonicTime { + nsec, err := t.GetTime(sentrytime.Monotonic) + if err != nil { + panic("timekeeper.GetTime(sentrytime.Monotonic): " + err.Error()) + } + var mt tcpip.MonotonicTime + return mt.Add(time.Duration(nsec) * time.Nanosecond) +} + +// AfterFunc implements tcpip.Clock. +func (t *Timekeeper) AfterFunc(d time.Duration, f func()) tcpip.Timer { + return ktime.TcpipAfterFunc(t.realtimeClock, d, f) +} + // startUpdater starts an update goroutine that keeps the clocks updated. // // mu must be held. diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index dfc3c0719..b6039505a 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -17,12 +17,12 @@ package kernel import ( "testing" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // mockClocks is a sentrytime.Clocks that simply returns the times in the @@ -45,7 +45,7 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { case sentrytime.Realtime: return c.realtime, nil default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 4c65215fa..560a0f33c 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -17,8 +17,10 @@ go_library( deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/abi/linux/errno", "//pkg/context", "//pkg/cpuid", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/rand", @@ -35,7 +37,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/syserr", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 8fc3e2a79..fb213d109 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -31,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -115,7 +115,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { log.Infof("Error reading ELF ident: %v", err) // The entire ident array always exists. if err == io.EOF || err == io.ErrUnexpectedEOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return elfInfo{}, err } @@ -123,22 +123,22 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { // Only some callers pre-check the ELF magic. if !bytes.Equal(ident[:len(elfMagic)], []byte(elfMagic)) { log.Infof("File is not an ELF") - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // We only support 64-bit, little endian binaries if class := elf.Class(ident[elf.EI_CLASS]); class != elf.ELFCLASS64 { log.Infof("Unsupported ELF class: %v", class) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if endian := elf.Data(ident[elf.EI_DATA]); endian != elf.ELFDATA2LSB { log.Infof("Unsupported ELF endianness: %v", endian) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT { log.Infof("Unsupported ELF version: %v", version) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // EI_OSABI is ignored by Linux, which is the only OS supported. os := abi.Linux @@ -150,7 +150,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { log.Infof("Error reading ELF header: %v", err) // The entire header always exists. if err == io.EOF || err == io.ErrUnexpectedEOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return elfInfo{}, err } @@ -165,7 +165,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { a = arch.ARM64 default: log.Infof("Unsupported ELF machine %d", machine) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } var sharedObject bool @@ -177,25 +177,25 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { sharedObject = true default: log.Infof("Unsupported ELF type %v", elfType) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if int(hdr.Phentsize) != prog64Size { log.Infof("Unsupported phdr size %d", hdr.Phentsize) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } totalPhdrSize := prog64Size * int(hdr.Phnum) if totalPhdrSize < prog64Size { log.Warningf("No phdrs or total phdr size overflows: prog64Size: %d phnum: %d", prog64Size, int(hdr.Phnum)) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if totalPhdrSize > maxTotalPhdrSize { log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 { ctx.Infof("Unsupported phdr offset %d", hdr.Phoff) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } phdrBuf := make([]byte, totalPhdrSize) @@ -204,7 +204,7 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { log.Infof("Error reading ELF phdrs: %v", err) // If phdrs were specified, they should all exist. if err == io.EOF || err == io.ErrUnexpectedEOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return elfInfo{}, err } @@ -247,19 +247,19 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr if !ok { // If offset != 0 we should have ensured this would fit. ctx.Warningf("Computed segment load address overflows: %#x + %#x", phdr.Vaddr, offset) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } addr -= hostarch.Addr(adjust) fileSize := phdr.Filesz + adjust if fileSize < phdr.Filesz { ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } ms, ok := hostarch.Addr(fileSize).RoundUp() if !ok { ctx.Infof("fileSize %#x too large", fileSize) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } mapSize := uint64(ms) @@ -320,7 +320,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr memSize := phdr.Memsz + adjust if memSize < phdr.Memsz { ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } // Allocate more anonymous pages if necessary. @@ -332,7 +332,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr anonSize, ok := hostarch.Addr(memSize - mapSize).RoundUp() if !ok { ctx.Infof("extra anon pages too large: %#x", memSize-mapSize) - return syserror.ENOEXEC + return linuxerr.ENOEXEC } // N.B. Linux uses vm_brk_flags to map these pages, which only @@ -422,27 +422,27 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in // NOTE(b/37474556): Linux allows out-of-order // segments, in violation of the spec. ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } var ok bool end, ok = vaddr.AddLength(phdr.Memsz) if !ok { ctx.Infof("PT_LOAD header size overflows. %#x + %#x", vaddr, phdr.Memsz) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } case elf.PT_INTERP: if phdr.Filesz < 2 { ctx.Infof("PT_INTERP path too small: %v", phdr.Filesz) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } if phdr.Filesz > linux.PATH_MAX { ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 { ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } path := make([]byte, phdr.Filesz) @@ -450,12 +450,12 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in if err != nil { // If an interpreter was specified, it should exist. ctx.Infof("Error reading PT_INTERP path: %v", err) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } if path[len(path)-1] != 0 { ctx.Infof("PT_INTERP path not NUL-terminated: %v", path) - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } // Strip NUL-terminator and everything beyond from @@ -476,7 +476,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in // the open path would return a different // error. ctx.Infof("PT_INTERP path is empty: %v", path) - return loadedELF{}, syserror.EACCES + return loadedELF{}, linuxerr.EACCES } } } @@ -497,7 +497,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in totalSize, ok := totalSize.RoundUp() if !ok { ctx.Infof("ELF PT_LOAD segments too big") - return loadedELF{}, syserror.ENOEXEC + return loadedELF{}, linuxerr.ENOEXEC } var err error @@ -517,13 +517,13 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in start, ok = start.AddLength(uint64(offset)) if !ok { ctx.Infof(fmt.Sprintf("Start %#x + offset %#x overflows?", start, offset)) - return loadedELF{}, syserror.EINVAL + return loadedELF{}, linuxerr.EINVAL } end, ok = end.AddLength(uint64(offset)) if !ok { ctx.Infof(fmt.Sprintf("End %#x + offset %#x overflows?", end, offset)) - return loadedELF{}, syserror.EINVAL + return loadedELF{}, linuxerr.EINVAL } info.entry, ok = info.entry.AddLength(uint64(offset)) @@ -591,7 +591,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS // Check Image Compatibility. if arch.Host != info.arch { ctx.Warningf("Found mismatch for platform %s with ELF type %s", arch.Host.String(), info.arch.String()) - return loadedELF{}, nil, syserror.ENOEXEC + return loadedELF{}, nil, linuxerr.ENOEXEC } // Create the arch.Context now so we can prepare the mmap layout before @@ -621,20 +621,20 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, f) if err != nil { - if err == syserror.ENOEXEC { + if linuxerr.Equals(linuxerr.ENOEXEC, err) { // Bad interpreter. - err = syserror.ELIBBAD + err = linuxerr.ELIBBAD } return loadedELF{}, err } if info.os != initial.os { ctx.Infof("Initial ELF OS %v and interpreter ELF OS %v differ", initial.os, info.os) - return loadedELF{}, syserror.ELIBBAD + return loadedELF{}, linuxerr.ELIBBAD } if info.arch != initial.arch { ctx.Infof("Initial ELF arch %v and interpreter ELF arch %v differ", initial.arch, info.arch) - return loadedELF{}, syserror.ELIBBAD + return loadedELF{}, linuxerr.ELIBBAD } // The interpreter is not given a load offset, as its location does not @@ -680,7 +680,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error if interp.interpreter != "" { // No recursive interpreters! ctx.Infof("Interpreter requires an interpreter") - return loadedELF{}, nil, syserror.ENOEXEC + return loadedELF{}, nil, linuxerr.ENOEXEC } } diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 3e302d92c..1ec0d7019 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -19,8 +19,8 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsbridge" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -43,14 +43,14 @@ func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.Fil // Short read is OK. if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return "", []string{}, err } line = line[:n] if !bytes.Equal(line[:2], []byte(interpreterScriptMagic)) { - return "", []string{}, syserror.ENOEXEC + return "", []string{}, linuxerr.ENOEXEC } // Ignore #!. line = line[2:] @@ -82,7 +82,7 @@ func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.Fil if string(interp) == "" { ctx.Infof("Interpreter script contains no interpreter: %v", line) - return "", []string{}, syserror.ENOEXEC + return "", []string{}, linuxerr.ENOEXEC } // Build the new argument list: diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 47e3775a3..2759ef71e 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -23,8 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -33,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -89,7 +90,7 @@ type LoadArgs struct { func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) { if args.Filename == "" { ctx.Infof("cannot open empty name") - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // TODO(gvisor.dev/issue/160): Linux requires only execute permission, @@ -112,7 +113,7 @@ func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string } if t != linux.ModeRegular { ctx.Infof("%q is not a regular file: %v", filename, t) - return syserror.EACCES + return linuxerr.EACCES } return nil } @@ -170,7 +171,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context // (e.g., #!a). if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { - err = syserror.ENOEXEC + err = linuxerr.ENOEXEC } return loadedELF{}, nil, nil, nil, err } @@ -188,7 +189,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): if args.CloseOnExec { - return loadedELF{}, nil, nil, nil, syserror.ENOENT + return loadedELF{}, nil, nil, nil, linuxerr.ENOENT } args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv) if err != nil { @@ -200,13 +201,13 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context default: ctx.Infof("Unknown magic: %v", hdr) - return loadedELF{}, nil, nil, nil, syserror.ENOEXEC + return loadedELF{}, nil, nil, nil, linuxerr.ENOEXEC } // Set to nil in case we loop on a Interpreter Script. args.File = nil } - return loadedELF{}, nil, nil, nil, syserror.ELOOP + return loadedELF{}, nil, nil, nil, linuxerr.ELOOP } // Load loads args.File into a MemoryManager. If args.File is nil, the path @@ -237,7 +238,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V // loaded.end is available for its use. e, ok := loaded.end.RoundUp() if !ok { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("brk overflows: %#x", loaded.end), linux.ENOEXEC) + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("brk overflows: %#x", loaded.end), errno.ENOEXEC) } args.MemoryManager.BrkSetup(ctx, e) @@ -294,15 +295,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) m.SetExecutable(ctx, file) - - symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn") - if err != nil { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to find rt_sigreturn in vdso: %v", err), syserr.FromError(err).ToLinux()) - } - - // Found rt_sigretrun. - addr := uint64(vdsoAddr) + symbolValue - vdsoPrelink - m.SetVDSOSigReturn(addr) + m.SetVDSOSigReturn(uint64(vdsoAddr) + vdsoSigreturnOffset - vdsoPrelink) ac.SetIP(uintptr(loaded.entry)) ac.SetStack(uintptr(stack.Bottom)) diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index fd54261fd..bcee6aef6 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -19,10 +19,10 @@ import ( "debug/elf" "fmt" "io" - "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" @@ -33,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -58,7 +57,7 @@ type byteFullReader struct { func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset >= int64(len(b.data)) { return 0, io.EOF @@ -101,14 +100,14 @@ func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, erro first = &info.phdrs[i] if phdr.Off != 0 { log.Warningf("First PT_LOAD segment has non-zero file offset") - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } } memoryOffset := phdr.Vaddr - first.Vaddr if memoryOffset != phdr.Off { log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // memsz larger than filesz means that extra zeroed space should be @@ -117,24 +116,24 @@ func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, erro // zeroes. if phdr.Memsz != phdr.Filesz { log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } start := hostarch.Addr(memoryOffset) end, ok := start.AddLength(phdr.Memsz) if !ok { log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if uint64(end) > size { log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } if prev != nil { if start < prevEnd { log.Warningf("PT_LOAD segments out of order") - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } // We mprotect entire pages, so each segment must be in @@ -143,7 +142,7 @@ func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, erro startPage := start.RoundDown() if prevEndPage >= startPage { log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage) - return elfInfo{}, syserror.ENOEXEC + return elfInfo{}, linuxerr.ENOEXEC } } prev = &info.phdrs[i] @@ -177,27 +176,6 @@ type VDSO struct { phdrs []elf.ProgHeader `state:".([]elfProgHeader)"` } -// getSymbolValueFromVDSO returns the specific symbol value in vdso.so. -func getSymbolValueFromVDSO(symbol string) (uint64, error) { - f, err := elf.NewFile(bytes.NewReader(vdsodata.Binary)) - if err != nil { - return 0, err - } - syms, err := f.Symbols() - if err != nil { - return 0, err - } - - for _, sym := range syms { - if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF { - if strings.Contains(sym.Name, symbol) { - return sym.Value, nil - } - } - } - return 0, fmt.Errorf("no %v in vdso.so", symbol) -} - // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { @@ -270,11 +248,11 @@ func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (hostarch.Addr, error) { if v.os != bin.os { ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } if v.arch != bin.arch { ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } // Reserve address space for the VDSO and its parameter page, which is @@ -347,35 +325,35 @@ func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) segAddr, ok := vdsoAddr.AddLength(memoryOffset) if !ok { ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } segPage := segAddr.RoundDown() segSize := hostarch.Addr(phdr.Memsz) segSize, ok = segSize.AddLength(segAddr.PageOffset()) if !ok { ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset()) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } segSize, ok = segSize.RoundUp() if !ok { ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset()) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } segEnd, ok := segPage.AddLength(uint64(segSize)) if !ok { ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } if segEnd > vdsoEnd { ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } perms := progFlagsAsPerms(phdr.Flags) if perms != hostarch.Read { if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil { ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err) - return 0, syserror.ENOEXEC + return 0, linuxerr.ENOEXEC } } } @@ -388,3 +366,21 @@ func (v *VDSO) Release(ctx context.Context) { v.ParamPage.DecRef(ctx) v.vdso.DecRef(ctx) } + +var vdsoSigreturnOffset = func() uint64 { + f, err := elf.NewFile(bytes.NewReader(vdsodata.Binary)) + if err != nil { + panic(fmt.Sprintf("failed to parse vdso.so as ELF file: %v", err)) + } + syms, err := f.Symbols() + if err != nil { + panic(fmt.Sprintf("failed to read symbols from vdso.so: %v", err)) + } + const sigreturnSymbol = "__kernel_rt_sigreturn" + for _, sym := range syms { + if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF && sym.Name == sigreturnSymbol { + return sym.Value + } + } + panic(fmt.Sprintf("no symbol %q in vdso.so", sigreturnSymbol)) +}() diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index c30e88725..a89bfa680 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -54,7 +54,6 @@ go_library( "//pkg/hostarch", "//pkg/log", "//pkg/safemem", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index b417c2da7..b7d782b7f 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -125,6 +125,7 @@ go_library( "//pkg/abi/linux", "//pkg/atomicbitops", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/refs", @@ -143,7 +144,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/buffer", "//pkg/usermem", ], @@ -156,6 +156,7 @@ go_test( library = ":mm", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/sentry/arch", "//pkg/sentry/contexttest", @@ -163,7 +164,6 @@ go_test( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 346866d3c..d71d64580 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -17,12 +17,12 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -77,15 +77,6 @@ func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) return nil } - // Only unmaps after it assured that the address is a valid aio context to - // prevent random memory from been unmapped. - // - // Note: It's possible to unmap this address and map something else into - // the same address. Then it would be unmapping memory that it doesn't own. - // This is, however, the way Linux implements AIO. Keeps the same [weird] - // semantics in case anyone relies on it. - mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize) - delete(mm.aioManager.contexts, id) aioCtx.destroy() return aioCtx @@ -158,11 +149,11 @@ func (ctx *AIOContext) Prepare() error { defer ctx.mu.Unlock() if ctx.dead { // Context died after the caller looked it up. - return syserror.EINVAL + return linuxerr.EINVAL } if ctx.outstanding >= ctx.maxOutstanding { // Context is busy. - return syserror.EAGAIN + return linuxerr.EAGAIN } ctx.outstanding++ return nil @@ -297,7 +288,7 @@ func (m *aioMappable) InodeID() uint64 { // Msync implements memmap.MappingIdentity.Msync. func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { // Linux: aio_ring_fops.fsync == NULL - return syserror.EINVAL + return linuxerr.EINVAL } // AddMapping implements memmap.Mappable.AddMapping. @@ -305,7 +296,7 @@ func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar ho // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { - return syserror.EFAULT + return linuxerr.EFAULT } return nil } @@ -319,13 +310,13 @@ func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, s // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { - return syserror.EFAULT + return linuxerr.EFAULT } // Require that the mapping correspond to a live AIOContext. Compare // Linux's fs/aio.c:aio_ring_mremap(). mm, ok := ms.(*MemoryManager) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } am := &mm.aioManager am.mu.Lock() @@ -333,12 +324,12 @@ func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, s oldID := uint64(srcAR.Start) aioCtx, ok := am.contexts[oldID] if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } aioCtx.mu.Lock() defer aioCtx.mu.Unlock() if aioCtx.dead { - return syserror.EINVAL + return linuxerr.EINVAL } // Use the new ID for the AIOContext. am.contexts[uint64(dstAR.Start)] = aioCtx @@ -350,7 +341,7 @@ func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, s func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { - err = &memmap.BusError{syserror.EFAULT} + err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ @@ -399,7 +390,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint id := uint64(addr) if !mm.aioManager.newAIOContext(events, id) { mm.MUnmap(ctx, addr, aioRingBufferSize) - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } return id, nil } @@ -411,6 +402,15 @@ func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOC return nil } + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize) + mm.aioManager.mu.Lock() defer mm.aioManager.mu.Unlock() return mm.destroyAIOContextLocked(ctx, id) diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 16f318ab3..5fcfeb473 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -16,10 +16,10 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -97,14 +97,14 @@ func translateIOError(ctx context.Context, err error) error { if logIOErrors { ctx.Debugf("MM I/O error: %v", err) } - return syserror.EFAULT + return linuxerr.EFAULT } // CopyOut implements usermem.IO.CopyOut. func (mm *MemoryManager) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) { ar, ok := mm.CheckIORange(addr, int64(len(src))) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if len(src) == 0 { @@ -147,7 +147,7 @@ func (mm *MemoryManager) asCopyOut(ctx context.Context, addr hostarch.Addr, src func (mm *MemoryManager) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) { ar, ok := mm.CheckIORange(addr, int64(len(dst))) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if len(dst) == 0 { @@ -190,7 +190,7 @@ func (mm *MemoryManager) asCopyIn(ctx context.Context, addr hostarch.Addr, dst [ func (mm *MemoryManager) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { ar, ok := mm.CheckIORange(addr, toZero) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if toZero == 0 { @@ -231,7 +231,7 @@ func (mm *MemoryManager) asZeroOut(ctx context.Context, addr hostarch.Addr, toZe // CopyOutFrom implements usermem.IO.CopyOutFrom. func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { if !mm.checkIOVec(ars) { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if ars.NumBytes() == 0 { @@ -276,7 +276,7 @@ func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRange // CopyInTo implements usermem.IO.CopyInTo. func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { if !mm.checkIOVec(ars) { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } if ars.NumBytes() == 0 { @@ -314,7 +314,7 @@ func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. @@ -339,7 +339,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } im := ims.Head() var err error @@ -357,7 +357,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. @@ -382,7 +382,7 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } im := ims.Head() var err error @@ -400,7 +400,7 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. @@ -425,7 +425,7 @@ func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opt _, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } im := ims.Head() var err error diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 57969b26c..0fca59b64 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -28,6 +28,7 @@ // memmap.File locks // mm.aioManager.mu // mm.AIOContext.mu +// kernel.TaskSet.mu // // Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in // multiple mm.MemoryManagers, as it does so in a well-defined order (forked diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index 1304b0a2f..84cb8158d 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -18,6 +18,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/contexttest" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -171,7 +171,7 @@ func TestIOAfterUnmap(t *testing.T) { } n, err = mm.CopyIn(ctx, addr, b, usermem.IOOpts{}) - if err != syserror.EFAULT { + if !linuxerr.Equals(linuxerr.EFAULT, err) { t.Errorf("CopyIn got err %v want EFAULT", err) } if n != 0 { @@ -212,7 +212,7 @@ func TestIOAfterMProtect(t *testing.T) { // Without IgnorePermissions, CopyOut should no longer succeed. n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{}) - if err != syserror.EFAULT { + if !linuxerr.Equals(linuxerr.EFAULT, err) { t.Errorf("CopyOut got err %v want EFAULT", err) } if n != 0 { @@ -249,7 +249,7 @@ func TestAIOPrepareAfterDestroy(t *testing.T) { mm.DestroyAIOContext(ctx, id) // Prepare should fail because aioCtx should be destroyed. - if err := aioCtx.Prepare(); err != syserror.EINVAL { + if err := aioCtx.Prepare(); !linuxerr.Equals(linuxerr.EINVAL, err) { t.Errorf("aioCtx.Prepare got err %v want nil", err) } else if err == nil { aioCtx.CancelPendingRequest() diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 5583f62b2..05cdcd8ae 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -18,12 +18,12 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // existingPMAsLocked checks that pmas exist for all addresses in ar, and @@ -116,7 +116,7 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar var alignerr error if !ok { end = ar.End.RoundDown() - alignerr = syserror.EFAULT + alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} @@ -162,7 +162,7 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.Addr var alignerr error if !ok { end = ar.End.RoundDown() - alignerr = syserror.EFAULT + alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} @@ -324,20 +324,37 @@ func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIter panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma)) } } - // The majority of copy-on-write breaks on executable pages - // come from: - // - // - The ELF loader, which must zero out bytes on the last - // page of each segment after the end of the segment. - // - // - gdb's use of ptrace to insert breakpoints. - // - // Neither of these cases has enough spatial locality to - // benefit from copying nearby pages, so if the vma is - // executable, only copy the pages required. var copyAR hostarch.AddrRange - if vseg.ValuePtr().effectivePerms.Execute { + if vma := vseg.ValuePtr(); vma.effectivePerms.Execute { + // The majority of copy-on-write breaks on executable + // pages come from: + // + // - The ELF loader, which must zero out bytes on the + // last page of each segment after the end of the + // segment. + // + // - gdb's use of ptrace to insert breakpoints. + // + // Neither of these cases has enough spatial locality + // to benefit from copying nearby pages, so if the vma + // is executable, only copy the pages required. copyAR = pseg.Range().Intersect(ar) + } else if vma.growsDown { + // In most cases, the new process will not use most of + // its stack before exiting or invoking execve(); it is + // especially unlikely to return very far down its call + // stack, since async-signal-safety concerns in + // multithreaded programs prevent the new process from + // being able to do much. So only copy up to one page + // before and after the pages required. + stackMaskAR := ar + if newStart := stackMaskAR.Start - hostarch.PageSize; newStart < stackMaskAR.Start { + stackMaskAR.Start = newStart + } + if newEnd := stackMaskAR.End + hostarch.PageSize; newEnd > stackMaskAR.End { + stackMaskAR.End = newEnd + } + copyAR = pseg.Range().Intersect(stackMaskAR) } else { copyAR = pseg.Range().Intersect(maskAR) } diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index 3130be80c..94d5112a1 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -16,16 +16,16 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" - "gvisor.dev/gvisor/pkg/syserror" ) // DetachShm unmaps a sysv shared memory segment. func (mm *MemoryManager) DetachShm(ctx context.Context, addr hostarch.Addr) error { if addr != addr.RoundDown() { // "... shmaddr is not aligned on a page boundary." - man shmdt(2) - return syserror.EINVAL + return linuxerr.EINVAL } var detached *shm.Shm @@ -48,7 +48,7 @@ func (mm *MemoryManager) DetachShm(ctx context.Context, addr hostarch.Addr) erro if detached == nil { // There is no shared memory segment attached at addr. - return syserror.EINVAL + return linuxerr.EINVAL } // Remove all vmas that could have been created by the same attach. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index e748b7ff8..69c6e77a7 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -16,11 +16,11 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) // SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with @@ -94,7 +94,7 @@ func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, hostar func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { - err = &memmap.BusError{syserror.EFAULT} + err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ @@ -144,11 +144,11 @@ func (m *SpecialMappable) Length() uint64 { // leak (b/143656263). Delete this function along with VFS1. func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } alignedLen, ok := hostarch.Addr(length).RoundUp() if !ok { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous) if err != nil { diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 7ad6b7c21..dc12ad357 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -21,12 +21,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" ) // HandleUserFault handles an application page fault. sp is the faulting @@ -36,7 +36,7 @@ import ( func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr, at hostarch.AccessType, sp hostarch.Addr) error { ar, ok := addr.RoundDown().ToRange(hostarch.PageSize) if !ok { - return syserror.EFAULT + return linuxerr.EFAULT } // Don't bother trying existingPMAsLocked; in most cases, if we did have @@ -74,22 +74,22 @@ func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr // MMap establishes a memory mapping. func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) { if opts.Length == 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } length, ok := hostarch.Addr(opts.Length).RoundUp() if !ok { - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } opts.Length = uint64(length) if opts.Mappable != nil { // Offset must be aligned. if hostarch.Addr(opts.Offset).RoundDown() != hostarch.Addr(opts.Offset) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Offset + length must not overflow. if end := opts.Offset + opts.Length; end < opts.Offset { - return 0, syserror.ENOMEM + return 0, linuxerr.EOVERFLOW } } else { opts.Offset = 0 @@ -99,19 +99,19 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostar // MAP_FIXED requires addr to be page-aligned; non-fixed mappings // don't. if opts.Fixed { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } opts.Addr = opts.Addr.RoundDown() } if !opts.MaxPerms.SupersetOf(opts.Perms) { - return 0, syserror.EACCES + return 0, linuxerr.EACCES } if opts.Unmap && !opts.Fixed { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if opts.GrowsDown && opts.Mappable != nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get the new vma. @@ -203,6 +203,7 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar h // * vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. +// +checklocksrelease:mm.mappingMu func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) { // See populateVMA above for commentary. if !vseg.ValuePtr().effectivePerms.Any() { @@ -251,7 +252,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, erro ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize) sz = maxStackSize } else if sz == 0 { - return hostarch.AddrRange{}, syserror.ENOMEM + return hostarch.AddrRange{}, linuxerr.ENOMEM } szaddr := hostarch.Addr(sz) ctx.Debugf("Allocating stack with size of %v bytes", sz) @@ -260,7 +261,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, erro // randomization can't be disabled. stackEnd := mm.layout.MaxAddr - hostarch.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown() if stackEnd < szaddr { - return hostarch.AddrRange{}, syserror.ENOMEM + return hostarch.AddrRange{}, linuxerr.ENOMEM } stackStart := stackEnd - szaddr mm.mappingMu.Lock() @@ -281,18 +282,18 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, erro // MUnmap implements the semantics of Linux's munmap(2). func (mm *MemoryManager) MUnmap(ctx context.Context, addr hostarch.Addr, length uint64) error { if addr != addr.RoundDown() { - return syserror.EINVAL + return linuxerr.EINVAL } if length == 0 { - return syserror.EINVAL + return linuxerr.EINVAL } la, ok := hostarch.Addr(length).RoundUp() if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } ar, ok := addr.ToRange(uint64(la)) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -331,7 +332,7 @@ const ( func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (hostarch.Addr, error) { // "Note that old_address has to be page aligned." - mremap(2) if oldAddr.RoundDown() != oldAddr { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a @@ -340,13 +341,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS oldSize = uint64(oldSizeAddr) newSizeAddr, ok := hostarch.Addr(newSize).RoundUp() if !ok || newSizeAddr == 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } newSize = uint64(newSizeAddr) oldEnd, ok := oldAddr.AddLength(oldSize) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } mm.mappingMu.Lock() @@ -355,7 +356,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // All cases require that a vma exists at oldAddr. vseg := mm.vmas.FindSegment(oldAddr) if !vseg.Ok() { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Behavior matrix: @@ -379,7 +380,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { - return 0, syserror.EAGAIN + return 0, linuxerr.EAGAIN } } } @@ -402,7 +403,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // "Grow" the existing vma by creating a new mergeable one. vma := vseg.ValuePtr() @@ -450,15 +451,15 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS case MRemapMustMove: newAddr := opts.NewAddr if newAddr.RoundDown() != newAddr { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } var ok bool newAR, ok = newAddr.ToRange(newSize) if !ok { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if (hostarch.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that the new region is valid. @@ -492,19 +493,19 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } // Check against RLIMIT_AS. newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } if vma := vseg.ValuePtr(); vma.mappable != nil { // Check that offset+length does not overflow. if vma.off+uint64(newAR.Length()) < vma.off { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Inform the Mappable, if any, of the new mapping. if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil { @@ -590,18 +591,18 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldS // MProtect implements the semantics of Linux's mprotect(2). func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms hostarch.AccessType, growsDown bool) error { if addr.RoundDown() != addr { - return syserror.EINVAL + return linuxerr.EINVAL } if length == 0 { return nil } rlength, ok := hostarch.Addr(length).RoundUp() if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(rlength)) if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } effectivePerms := realPerms.Effective() @@ -614,19 +615,19 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h // the non-growsDown case. vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { - return syserror.ENOMEM + return linuxerr.ENOMEM } if growsDown { if !vseg.ValuePtr().growsDown { - return syserror.EINVAL + return linuxerr.EINVAL } if ar.End <= vseg.Start() { - return syserror.ENOMEM + return linuxerr.ENOMEM } ar.Start = vseg.Start() } else { if ar.Start < vseg.Start() { - return syserror.ENOMEM + return linuxerr.ENOMEM } } @@ -644,7 +645,7 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h // Check for permission validity before splitting vmas, for consistency // with Linux. if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { - return syserror.EACCES + return linuxerr.EACCES } vseg = mm.vmas.Isolate(vseg, ar) @@ -686,7 +687,7 @@ func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms h } vseg, _ = vseg.NextNonEmpty() if !vseg.Ok() { - return syserror.ENOMEM + return linuxerr.ENOMEM } } } @@ -711,7 +712,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch. if addr < mm.brk.Start { addr = mm.brk.End mm.mappingMu.Unlock() - return addr, syserror.EINVAL + return addr, linuxerr.EINVAL } // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is @@ -722,7 +723,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { addr = mm.brk.End mm.mappingMu.Unlock() - return addr, syserror.ENOMEM + return addr, linuxerr.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() @@ -730,7 +731,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch. if !ok { addr = mm.brk.End mm.mappingMu.Unlock() - return addr, syserror.EFAULT + return addr, linuxerr.EFAULT } switch { @@ -780,7 +781,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u la, _ := hostarch.Addr(length + addr.PageOffset()).RoundUp() ar, ok := addr.RoundDown().ToRange(uint64(la)) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -792,11 +793,11 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { mm.mappingMu.Unlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } } } @@ -833,7 +834,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u mm.vmas.MergeAdjacent(ar) if unmapped { mm.mappingMu.Unlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } if mode == memmap.MLockEager { @@ -848,18 +849,18 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length u // case, which is converted to ENOMEM by mlock. mm.activeMu.Unlock() mm.mappingMu.RUnlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), hostarch.NoAccess) if err != nil { mm.activeMu.Unlock() mm.mappingMu.RUnlock() // Linux: mm/mlock.c:__mlock_posix_error_return() - if err == syserror.EFAULT { - return syserror.ENOMEM + if linuxerr.Equals(linuxerr.EFAULT, err) { + return linuxerr.ENOMEM } - if err == syserror.ENOMEM { - return syserror.EAGAIN + if linuxerr.Equals(linuxerr.ENOMEM, err) { + return linuxerr.EAGAIN } return err } @@ -898,7 +899,7 @@ type MLockAllOpts struct { // depending on opts. func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { if !opts.Current && !opts.Future { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -911,11 +912,11 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() - return syserror.EPERM + return linuxerr.EPERM } if uint64(mm.vmas.Span()) > mlockLimit { mm.mappingMu.Unlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } } } @@ -970,7 +971,7 @@ func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint6 defer mm.mappingMu.RUnlock() vseg := mm.vmas.FindSegment(addr) if !vseg.Ok() { - return 0, 0, syserror.EFAULT + return 0, 0, linuxerr.EFAULT } vma := vseg.ValuePtr() return vma.numaPolicy, vma.numaNodemask, nil @@ -979,13 +980,13 @@ func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint6 // SetNumaPolicy implements the semantics of Linux's mbind(). func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error { if !addr.IsPageAligned() { - return syserror.EINVAL + return linuxerr.EINVAL } // Linux allows this to overflow. la, _ := hostarch.Addr(length).RoundUp() ar, ok := addr.ToRange(uint64(la)) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } if ar.Length() == 0 { return nil @@ -1003,7 +1004,7 @@ func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy if !vseg.Ok() || lastEnd < vseg.Start() { // "EFAULT: ... there was an unmapped hole in the specified memory // range specified [sic] by addr and len." - mbind(2) - return syserror.EFAULT + return linuxerr.EFAULT } vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() @@ -1021,7 +1022,7 @@ func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork bool) error { ar, ok := addr.ToRange(length) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.Lock() @@ -1038,7 +1039,7 @@ func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork } if mm.vmas.SpanRange(ar) != ar.Length() { - return syserror.ENOMEM + return linuxerr.ENOMEM } return nil } @@ -1047,7 +1048,7 @@ func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { ar, ok := addr.ToRange(length) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } mm.mappingMu.RLock() @@ -1063,7 +1064,7 @@ func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { - return syserror.EINVAL + return linuxerr.EINVAL } vsegAR := vseg.Range().Intersect(ar) // pseg should already correspond to either this vma or a later one, @@ -1097,7 +1098,7 @@ func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { // to the rest (but returns ENOMEM from the system call, as it should)." - // madvise(2) if mm.vmas.SpanRange(ar) != ar.Length() { - return syserror.ENOMEM + return linuxerr.ENOMEM } return nil } @@ -1114,18 +1115,18 @@ type MSyncOpts struct { // MSync implements the semantics of Linux's msync(). func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length uint64, opts MSyncOpts) error { if addr != addr.RoundDown() { - return syserror.EINVAL + return linuxerr.EINVAL } if length == 0 { return nil } la, ok := hostarch.Addr(length).RoundUp() if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { - return syserror.ENOMEM + return linuxerr.ENOMEM } mm.mappingMu.RLock() @@ -1133,7 +1134,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { mm.mappingMu.RUnlock() - return syserror.ENOMEM + return linuxerr.ENOMEM } var unmapped bool lastEnd := ar.Start @@ -1150,7 +1151,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u vma := vseg.ValuePtr() if opts.Invalidate && vma.mlockMode != memmap.MLockNone { mm.mappingMu.RUnlock() - return syserror.EBUSY + return linuxerr.EBUSY } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect @@ -1182,7 +1183,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u } if unmapped { - return syserror.ENOMEM + return linuxerr.ENOMEM } return nil } @@ -1191,7 +1192,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length u func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr hostarch.Addr) (futex.Key, error) { ar, ok := addr.ToRange(4) // sizeof(int32). if !ok { - return futex.Key{}, syserror.EFAULT + return futex.Key{}, linuxerr.EFAULT } mm.mappingMu.RLock() diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 0d019e41d..e34b7a2f7 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -19,12 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" ) // Preconditions: @@ -58,7 +58,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp newUsageAS -= uint64(mm.vmas.SpanRange(ar)) } if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { - return vmaIterator{}, hostarch.AddrRange{}, syserror.ENOMEM + return vmaIterator{}, hostarch.AddrRange{}, linuxerr.ENOMEM } if opts.MLockMode != memmap.MLockNone { @@ -66,14 +66,14 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { - return vmaIterator{}, hostarch.AddrRange{}, syserror.EPERM + return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EPERM } newLockedAS := mm.lockedAS + opts.Length if opts.Unmap { newLockedAS -= mm.mlockedBytesRangeLocked(ar) } if newLockedAS > mlockLimit { - return vmaIterator{}, hostarch.AddrRange{}, syserror.EAGAIN + return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EAGAIN } } } @@ -177,7 +177,7 @@ func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOp // Fixed mappings accept only the requested address. if opts.Fixed { - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } // Prefer hugepage alignment if a hugepage or more is requested. @@ -215,7 +215,7 @@ func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bou return gr.Start, nil } } - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } // Preconditions: mm.mappingMu must be locked. @@ -235,7 +235,7 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return start, nil } } - return 0, syserror.ENOMEM + return 0, linuxerr.ENOMEM } // Preconditions: mm.mappingMu must be locked. @@ -288,7 +288,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRang vma := vseg.ValuePtr() if addr < vseg.Start() { // TODO(jamieliu): Implement vma.growsDown here. - return vbegin, vgap, syserror.EFAULT + return vbegin, vgap, linuxerr.EFAULT } perms := vma.effectivePerms @@ -296,7 +296,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRang perms = vma.maxPerms } if !perms.SupersetOf(at) { - return vbegin, vgap, syserror.EPERM + return vbegin, vgap, linuxerr.EPERM } addr = vseg.End() @@ -308,7 +308,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRang } // Ran out of vmas before ar.End. - return vbegin, vgap, syserror.EFAULT + return vbegin, vgap, linuxerr.EFAULT } // getVecVMAsLocked ensures that vmas exist for all addresses in ars, and diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 57d73d770..496a9fd97 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -85,6 +85,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/memutil", @@ -96,7 +97,6 @@ go_library( "//pkg/state", "//pkg/state/wire", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index d1a883da4..68e17d343 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -31,6 +31,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" @@ -38,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // MemoryFile is a memmap.File whose pages may be allocated to arbitrary @@ -403,7 +403,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.File // Find a range in the underlying file. fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment) if !ok { - return memmap.FileRange{}, syserror.ENOMEM + return memmap.FileRange{}, linuxerr.ENOMEM } // Expand the file if needed. @@ -674,7 +674,7 @@ func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (s panic(fmt.Sprintf("invalid range: %v", fr)) } if at.Execute { - return safemem.BlockSeq{}, syserror.EACCES + return safemem.BlockSeq{}, linuxerr.EACCES } chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) @@ -944,7 +944,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( // NOTE(b/165896008): mincore (which is passed as checkCommitted) // by f.UpdateUsage() might take a really long time. So unlock f.mu // while checkCommitted runs. - f.mu.Unlock() + f.mu.Unlock() // +checklocksforce err := checkCommitted(s, buf) f.mu.Lock() if err != nil { diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index d761bbdee..0567c8d32 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm @@ -74,8 +75,27 @@ func (c *vCPU) KernelSyscall() { // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. We only worry about saving on exit. ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment. + // N.B. Since KernelSyscall is called when the kernel makes a syscall, + // FS_BASE is already set for correct execution of this function. + // + // Refresher on syscall/exception handling: + // 1. When the sentry is in guest mode and makes a syscall, it goes to + // sysenter(), which saves the register state (including RIP of SYSCALL + // instruction) to vCPU.registers. + // 2. It then calls KernelSyscall, which rewinds the IP and executes + // HLT. + // 3. HLT does a VM-exit to bluepillHandler, which returns from the + // signal handler using vCPU.registers, directly to the SYSCALL + // instruction. + // 4. Later, when we want to re-use the vCPU (perhaps on a different + // host thread), we set the new thread's registers in vCPU.registers + // (as opposed to setting the KVM registers with KVM_SET_REGS). + // 5. KVM_RUN thus enters the guest with the old register state, + // immediately following the HLT instruction, returning here. + // 6. We then restore FS_BASE and the full registers from vCPU.register + // to return from sysenter() back to the desired bluepill point from + // the host. + ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // KernelException handles kernel exceptions. @@ -93,8 +113,8 @@ func (c *vCPU) KernelException(vector ring0.Vector) { } // See above. ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment. + // See above. + ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // bluepillArchExit is called during bluepillEnter. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 953024600..c2a1dca11 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -37,7 +37,15 @@ TEXT ·bluepill(SB),NOSPLIT,$0 begin: MOVQ vcpu+0(FP), AX LEAQ VCPU_CPU(AX), BX + + // The gorountine stack will be changed in guest which renders + // the frame pointer outdated and misleads perf tools. + // Disconnect the frame-chain with the zeroed frame pointer + // when it is saved in the frame in bluepillHandler(). + MOVQ BP, CX + MOVQ $0, BP BYTE CLI; + MOVQ CX, BP check_vcpu: MOVQ ENTRY_CPU_SELF(GS), CX CMPQ BX, CX diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 198bafdea..4ba1d6f9c 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index 578852c3f..acb0cb05f 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm @@ -25,29 +26,6 @@ import ( var ( // The action for bluepillSignal is changed by sigaction(). bluepillSignal = unix.SIGILL - - // vcpuSErrBounce is the event of system error for bouncing KVM. - vcpuSErrBounce = kvmVcpuEvents{ - exception: exception{ - sErrPending: 1, - }, - } - - // vcpuSErrNMI is the event of system error to trigger sigbus. - vcpuSErrNMI = kvmVcpuEvents{ - exception: exception{ - sErrPending: 1, - sErrHasEsr: 1, - sErrEsr: _ESR_ELx_SERR_NMI, - }, - } - - // vcpuExtDabt is the event of ext_dabt. - vcpuExtDabt = kvmVcpuEvents{ - exception: exception{ - extDabtPending: 1, - }, - } ) // getTLS returns the value of TPIDR_EL0 register. diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index 07fc4f216..ee7dba828 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm @@ -80,11 +81,18 @@ func getHypercallID(addr uintptr) int { // //go:nosplit func bluepillStopGuest(c *vCPU) { + // vcpuSErrBounce is the event of system error for bouncing KVM. + vcpuSErrBounce := &kvmVcpuEvents{ + exception: exception{ + sErrPending: 1, + }, + } + if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), _KVM_SET_VCPU_EVENTS, - uintptr(unsafe.Pointer(&vcpuSErrBounce))); errno != 0 { + uintptr(unsafe.Pointer(vcpuSErrBounce))); errno != 0 { throw("bounce sErr injection failed") } } @@ -93,12 +101,21 @@ func bluepillStopGuest(c *vCPU) { // //go:nosplit func bluepillSigBus(c *vCPU) { + // vcpuSErrNMI is the event of system error to trigger sigbus. + vcpuSErrNMI := &kvmVcpuEvents{ + exception: exception{ + sErrPending: 1, + sErrHasEsr: 1, + sErrEsr: _ESR_ELx_SERR_NMI, + }, + } + // Host must support ARM64_HAS_RAS_EXTN. if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), _KVM_SET_VCPU_EVENTS, - uintptr(unsafe.Pointer(&vcpuSErrNMI))); errno != 0 { + uintptr(unsafe.Pointer(vcpuSErrNMI))); errno != 0 { if errno == unix.EINVAL { throw("No ARM64_HAS_RAS_EXTN feature in host.") } @@ -110,11 +127,18 @@ func bluepillSigBus(c *vCPU) { // //go:nosplit func bluepillExtDabt(c *vCPU) { + // vcpuExtDabt is the event of ext_dabt. + vcpuExtDabt := &kvmVcpuEvents{ + exception: exception{ + extDabtPending: 1, + }, + } + if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), _KVM_SET_VCPU_EVENTS, - uintptr(unsafe.Pointer(&vcpuExtDabt))); errno != 0 { + uintptr(unsafe.Pointer(vcpuExtDabt))); errno != 0 { throw("ext_dabt injection failed") } } diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 28a613a54..7a3c97c5a 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -55,11 +55,7 @@ func calculateBluepillFault(physical uintptr, phyRegions []physicalRegion) (virt } // Adjust the block to match our size. - physicalStart = alignedPhysical & faultBlockMask - if physicalStart < pr.physical { - // Bound the starting point to the start of the region. - physicalStart = pr.physical - } + physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask virtualStart = pr.virtual + (physicalStart - pr.physical) physicalEnd := physicalStart + faultBlockSize if physicalEnd > end { @@ -101,7 +97,7 @@ func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegi // Store the physical address in the slot. This is used to // avoid calls to handleBluepillFault in the future (see // machine.mapPhysical). - atomic.StoreUintptr(&m.usedSlots[slot], physical) + atomic.StoreUintptr(&m.usedSlots[slot], physicalStart) // Successfully added region; we can increment nextSlot and // allow another set to proceed here. atomic.StoreUint32(&m.nextSlot, slot+1) diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 6f87236ad..0f0c1e73b 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.12 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. package kvm @@ -28,7 +30,7 @@ import ( ) //go:linkname throw runtime.throw -func throw(string) +func throw(s string) // vCPUPtr returns a CPU for the given address. // @@ -85,6 +87,13 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { // signal stack. It should only execute raw system calls and functions that are // explicitly marked go:nosplit. // +// Ideally, this function should switch to gsignal, as runtime.sigtramp does, +// but that is tedious given all the runtime internals. That said, using +// gsignal inside a signal handler is not _required_, provided we avoid stack +// splits and allocations. Note that calling any splittable function here will +// be flaky; if the signal stack is below the G stack then we will trigger a +// split and crash. If above, we won't trigger a split. +// // +checkescape:all // //go:nosplit diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index b9ed4a706..a5189d9e2 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go index b1cab89a0..c3fbbdc75 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm @@ -28,7 +29,7 @@ import ( ) func TestSegments(t *testing.T) { - applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleSegments(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) for { var si linux.SignalInfo @@ -55,7 +56,7 @@ func TestSegments(t *testing.T) { func stmxcsr(addr *uint32) func TestMXCSR(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo switchOpts := ring0.SwitchOpts{ Registers: regs, diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index 0c43d72f4..7fdb6ac64 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index b73340f0e..159808433 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go index 0e3d84d95..b53e354da 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64_test.go +++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go index f07a9f34d..54d579a2b 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index fe570aff9..3a30286e2 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -120,13 +120,13 @@ func TestKernelFloatingPoint(t *testing.T) { }) } -func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { +func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { // Initialize registers & page tables. var ( regs arch.Registers pt *pagetables.PageTables ) - testutil.SetTestTarget(®s, target) + testutil.SetTestTarget(®s, targetFn) kvmTest(t, func(k *KVM) { // Create new page tables. @@ -157,7 +157,7 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func } func TestApplicationSyscall(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -171,7 +171,7 @@ func TestApplicationSyscall(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -187,7 +187,7 @@ func TestApplicationSyscall(t *testing.T) { } func TestApplicationFault(t *testing.T) { - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -202,7 +202,7 @@ func TestApplicationFault(t *testing.T) { } return false }) - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -219,7 +219,7 @@ func TestApplicationFault(t *testing.T) { } func TestRegistersSyscall(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleRegsSyscall(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si linux.SignalInfo @@ -242,7 +242,7 @@ func TestRegistersSyscall(t *testing.T) { } func TestRegistersFault(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleRegsFault(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si linux.SignalInfo @@ -266,7 +266,7 @@ func TestRegistersFault(t *testing.T) { } func TestBounce(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -281,7 +281,7 @@ func TestBounce(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -300,7 +300,7 @@ func TestBounce(t *testing.T) { } func TestBounceStress(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { randomSleep := func() { // O(hundreds of microseconds) is appropriate to ensure // different overlaps and different schedules. @@ -336,7 +336,7 @@ func TestBounceStress(t *testing.T) { func TestInvalidate(t *testing.T) { var data uintptr // Used below. - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. for { var si linux.SignalInfo @@ -377,7 +377,7 @@ func IsFault(err error, si *linux.SignalInfo) bool { } func TestEmptyAddressSpace(t *testing.T) { - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -391,7 +391,7 @@ func TestEmptyAddressSpace(t *testing.T) { } return false }) - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -467,7 +467,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { i int // Iteration includes machine.Get() / machine.Put(). a int // Count for ErrContextInterrupt. ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -489,7 +489,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { func BenchmarkKernelSyscall(b *testing.B) { // Note that the target passed here is irrelevant, we never execute SwitchToUser. - applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfGetpid(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { // iteration does not include machine.Get() / machine.Put(). for i := 0; i < b.N; i++ { testutil.Getpid() @@ -504,7 +504,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { i int a int ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 1b5d5f66e..d67563958 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -70,7 +70,7 @@ type machine struct { // tscControl checks whether cpu supports TSC scaling tscControl bool - // usedSlots is the set of used physical addresses (sorted). + // usedSlots is the set of used physical addresses (not sorted). usedSlots []uintptr // nextID is the next vCPU ID. @@ -296,13 +296,20 @@ func newMachine(vm int) (*machine, error) { return m, nil } -// hasSlot returns true iff the given address is mapped. +// hasSlot returns true if the given address is mapped. // // This must be done via a linear scan. // //go:nosplit func (m *machine) hasSlot(physical uintptr) bool { - for i := 0; i < len(m.usedSlots); i++ { + slotLen := int(atomic.LoadUint32(&m.nextSlot)) + // When slots are being updated, nextSlot is ^uint32(0). As this situation + // is less likely happen, we just set the slotLen to m.maxSlots, and scan + // the whole usedSlots array. + if slotLen == int(^uint32(0)) { + slotLen = m.maxSlots + } + for i := 0; i < slotLen; i++ { if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { return true } @@ -512,15 +519,21 @@ func (c *vCPU) lock() { // //go:nosplit func (c *vCPU) unlock() { - if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) { + origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) + if origState == vCPUUser|vCPUGuest { // Happy path: no exits are forced, and we can continue // executing on our merry way with a single atomic access. return } // Clear the lock. - origState := atomic.LoadUint32(&c.state) - atomicbitops.AndUint32(&c.state, ^vCPUUser) + for { + state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) + if state == origState { + break + } + origState = state + } switch origState { case vCPUUser: // Normal state. diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index b8e1cd72c..a96634381 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm @@ -136,7 +137,7 @@ func (c *vCPU) initArchState() error { } // Set the entrypoint for the kernel. - kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) + kernelUserRegs.RIP = uint64(ring0.AddrOfStart()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet @@ -469,7 +470,7 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) { } func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { - // Map all the executible regions so that all the entry functions + // Map all the executable regions so that all the entry functions // are mapped in the upper half. applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" { @@ -485,7 +486,7 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { pageTable.Map( hostarch.Addr(ring0.KernelStartAddress|r.virtual), r.length, - pagetables.MapOpts{AccessType: hostarch.Execute}, + pagetables.MapOpts{AccessType: hostarch.Execute, Global: true}, physical) } }) @@ -498,7 +499,7 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { pageTable.Map( hostarch.Addr(ring0.KernelStartAddress|start), regionLen, - pagetables.MapOpts{AccessType: hostarch.ReadWrite}, + pagetables.MapOpts{AccessType: hostarch.ReadWrite, Global: true}, physical) } } diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 83bcc7406..de798bb2c 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index edaccf9bc..7937a8481 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 1b0a6e0a7..1a4a9ce7d 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package kvm @@ -140,22 +141,15 @@ func (c *vCPU) initArchState() error { // vbar_el1 reg.id = _KVM_ARM64_REGS_VBAR_EL1 - - fromLocation := reflect.ValueOf(ring0.Vectors).Pointer() - offset := fromLocation & (1<<11 - 1) - if offset != 0 { - offset = 1<<11 - offset - } - - toLocation := fromLocation + offset - data = uint64(ring0.KernelStartAddress | toLocation) + vectorLocation := reflect.ValueOf(ring0.Vectors).Pointer() + data = uint64(ring0.KernelStartAddress | vectorLocation) if err := c.setOneRegister(®); err != nil { return err } // Use the address of the exception vector table as // the MMIO address base. - arm64HypercallMMIOBase = toLocation + arm64HypercallMMIOBase = vectorLocation // Initialize the PCID database. if hasGuestPCID { diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 49e1c7136..cc3a1253b 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.12 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. package kvm diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 5c1efa0fd..d8c273796 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -23,23 +23,41 @@ import ( // Getpid executes a trivial system call. func Getpid() -// Touch touches the value in the first register. -func Touch() +// AddrOfGetpid returns the address of Getpid. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func AddrOfGetpid() uintptr + +// AddrOfTouch returns the address of a function that touches the value in the +// first register. +func AddrOfTouch() uintptr +func touch() -// SyscallLoop executes a syscall and loops. -func SyscallLoop() +// AddrOfSyscallLoop returns the address of a function that executes a syscall +// and loops. +func AddrOfSyscallLoop() uintptr +func syscallLoop() -// SpinLoop spins on the CPU. -func SpinLoop() +// AddrOfSpinLoop returns the address of a function that spins on the CPU. +func AddrOfSpinLoop() uintptr +func spinLoop() -// HaltLoop immediately halts and loops. -func HaltLoop() +// AddrOfHaltLoop returns the address of a function that immediately halts and +// loops. +func AddrOfHaltLoop() uintptr +func haltLoop() -// TwiddleRegsFault twiddles registers then faults. -func TwiddleRegsFault() +// AddrOfTwiddleRegsFault returns the address of a function that twiddles +// registers then faults. +func AddrOfTwiddleRegsFault() uintptr +func twiddleRegsFault() -// TwiddleRegsSyscall twiddles registers then executes a syscall. -func TwiddleRegsSyscall() +// AddrOfTwiddleRegsSyscall returns the address of a function that twiddles +// registers then executes a syscall. +func AddrOfTwiddleRegsSyscall() uintptr +func twiddleRegsSyscall() // FloatingPointWorks is a floating point test. // diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 8048eedec..98c52b2f5 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package testutil @@ -22,12 +23,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) -// TwiddleSegments reads segments into known registers. -func TwiddleSegments() +// AddrOfTwiddleSegments return the address of a function that reads segments +// into known registers. +func AddrOfTwiddleSegments() uintptr +func twiddleSegments() // SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *arch.Registers, fn func()) { - regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) +func SetTestTarget(regs *arch.Registers, fn uintptr) { + regs.Rip = uint64(fn) } // SetTouchTarget sets rax appropriately. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s index 491ec0c2a..65e7c05ea 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -25,27 +25,46 @@ TEXT ·Getpid(SB),NOSPLIT,$0 SYSCALL RET -TEXT ·Touch(SB),NOSPLIT,$0 +// func AddrOfGetpid() uintptr +TEXT ·AddrOfGetpid(SB), $0-8 + MOVQ $·Getpid(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·touch(SB),NOSPLIT,$0 start: MOVQ 0(AX), BX // deref AX MOVQ $39, AX // getpid SYSCALL JMP start -TEXT ·HaltLoop(SB),NOSPLIT,$0 -start: - HLT - JMP start +// func AddrOfTouch() uintptr +TEXT ·AddrOfTouch(SB), $0-8 + MOVQ $·touch(SB), AX + MOVQ AX, ret+0(FP) + RET -TEXT ·SyscallLoop(SB),NOSPLIT,$0 +TEXT ·syscallLoop(SB),NOSPLIT,$0 start: SYSCALL JMP start -TEXT ·SpinLoop(SB),NOSPLIT,$0 +// func AddrOfSyscallLoop() uintptr +TEXT ·AddrOfSyscallLoop(SB), $0-8 + MOVQ $·syscallLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·spinLoop(SB),NOSPLIT,$0 start: JMP start +// func AddrOfSpinLoop() uintptr +TEXT ·AddrOfSpinLoop(SB), $0-8 + MOVQ $·spinLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS MOVQ $1, AX @@ -75,20 +94,32 @@ TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NOTQ DI; \ NOTQ SP; -TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 +TEXT ·twiddleRegsSyscall(SB),NOSPLIT,$0 TWIDDLE_REGS() SYSCALL RET // never reached -TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 +// func AddrOfTwiddleRegsSyscall() uintptr +TEXT ·AddrOfTwiddleRegsSyscall(SB), $0-8 + MOVQ $·twiddleRegsSyscall(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·twiddleRegsFault(SB),NOSPLIT,$0 TWIDDLE_REGS() JMP AX // must fault RET // never reached +// func AddrOfTwiddleRegsFault() uintptr +TEXT ·AddrOfTwiddleRegsFault(SB), $0-8 + MOVQ $·twiddleRegsFault(SB), AX + MOVQ AX, ret+0(FP) + RET + #define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00; #define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00; -TEXT ·TwiddleSegments(SB),NOSPLIT,$0 +TEXT ·twiddleSegments(SB),NOSPLIT,$0 MOVQ $0x0, AX READ_GS() MOVQ AX, BX @@ -96,3 +127,9 @@ TEXT ·TwiddleSegments(SB),NOSPLIT,$0 READ_FS() SYSCALL RET // never reached + +// func AddrOfTwiddleSegments() uintptr +TEXT ·AddrOfTwiddleSegments(SB), $0-8 + MOVQ $·twiddleSegments(SB), AX + MOVQ AX, ret+0(FP) + RET diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index c5235ca9d..6d0ba8252 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package testutil diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go index 4f7fe993a..07eda0ef3 100644 --- a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 90b1ead56..13a55b784 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package ptrace @@ -176,6 +177,7 @@ func patchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) { // // This is safe to call in an afterFork context. // +//go:norace //go:nosplit func enableCpuidFault() { unix.RawSyscall6(unix.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index e4257e3bf..8181db659 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package ptrace diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 4f0260432..129ca52e2 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package ptrace @@ -120,6 +121,17 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro return nil, err } + return forkStub(flags, instrs) +} + +// In the child, this function must not acquire any locks, because they might +// have been locked at the time of the fork. This means no rescheduling, no +// malloc calls, and no new stack segments. For the same reason compiler does +// not race instrument it. +// +// +//go:norace +func forkStub(flags uintptr, instrs []linux.BPFInstruction) (*thread, error) { // Declare all variables up front in order to ensure that there's no // need for allocations between beforeFork & afterFork. var ( @@ -181,7 +193,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // Set an aggressive BPF filter for the stub and all it's children. See // the description of the BPF program built above. - if errno := seccomp.SetFilter(instrs); errno != 0 { + if errno := seccomp.SetFilterInChild(instrs); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index 9c342c59b..f1e84059d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux && (amd64 || arm64) // +build linux // +build amd64 arm64 @@ -26,6 +27,7 @@ import ( // unmaskAllSignals unmasks all signals on the current thread. // +//go:norace //go:nosplit func unmaskAllSignals() unix.Errno { var set linux.SignalSet diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index 38b7b1a5e..304722200 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.12 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. package ptrace diff --git a/pkg/sentry/seccheck/BUILD b/pkg/sentry/seccheck/BUILD new file mode 100644 index 000000000..943fa180d --- /dev/null +++ b/pkg/sentry/seccheck/BUILD @@ -0,0 +1,54 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_fieldenum:defs.bzl", "go_fieldenum") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_fieldenum( + name = "seccheck_fieldenum", + srcs = [ + "clone.go", + "task.go", + ], + out = "seccheck_fieldenum.go", + package = "seccheck", +) + +go_template_instance( + name = "seqatomic_checkerslice", + out = "seqatomic_checkerslice_unsafe.go", + package = "seccheck", + suffix = "CheckerSlice", + template = "//pkg/sync/seqatomic:generic_seqatomic", + types = { + "Value": "[]Checker", + }, +) + +go_library( + name = "seccheck", + srcs = [ + "clone.go", + "seccheck.go", + "seccheck_fieldenum.go", + "seqatomic_checkerslice_unsafe.go", + "task.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/gohacks", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sync", + ], +) + +go_test( + name = "seccheck_test", + size = "small", + srcs = ["seccheck_test.go"], + library = ":seccheck", + deps = ["//pkg/context"], +) diff --git a/pkg/sentry/seccheck/clone.go b/pkg/sentry/seccheck/clone.go new file mode 100644 index 000000000..7546fa021 --- /dev/null +++ b/pkg/sentry/seccheck/clone.go @@ -0,0 +1,53 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// CloneInfo contains information used by the Clone checkpoint. +// +// +fieldenum Clone +type CloneInfo struct { + // Invoker identifies the invoking thread. + Invoker TaskInfo + + // Credentials are the invoking thread's credentials. + Credentials *auth.Credentials + + // Args contains the arguments to kernel.Task.Clone(). + Args linux.CloneArgs + + // Created identifies the created thread. + Created TaskInfo +} + +// CloneReq returns fields required by the Clone checkpoint. +func (s *state) CloneReq() CloneFieldSet { + return s.cloneReq.Load() +} + +// Clone is called at the Clone checkpoint. +func (s *state) Clone(ctx context.Context, mask CloneFieldSet, info *CloneInfo) error { + for _, c := range s.getCheckers() { + if err := c.Clone(ctx, mask, *info); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/seccheck/seccheck.go b/pkg/sentry/seccheck/seccheck.go new file mode 100644 index 000000000..b6c9d44ce --- /dev/null +++ b/pkg/sentry/seccheck/seccheck.go @@ -0,0 +1,136 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package seccheck defines a structure for dynamically-configured security +// checks in the sentry. +package seccheck + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sync" +) + +// A Point represents a checkpoint, a point at which a security check occurs. +type Point uint + +// PointX represents the checkpoint X. +const ( + PointClone Point = iota + // Add new Points above this line. + pointLength + + numPointBitmaskUint32s = (int(pointLength)-1)/32 + 1 +) + +// A Checker performs security checks at checkpoints. +// +// Each Checker method X is called at checkpoint X; if the method may return a +// non-nil error and does so, it causes the checked operation to fail +// immediately (without calling subsequent Checkers) and return the error. The +// info argument contains information relevant to the check. The mask argument +// indicates what fields in info are valid; the mask should usually be a +// superset of fields requested by the Checker's corresponding CheckerReq, but +// may be missing requested fields in some cases (e.g. if the Checker is +// registered concurrently with invocations of checkpoints). +type Checker interface { + Clone(ctx context.Context, mask CloneFieldSet, info CloneInfo) error +} + +// CheckerDefaults may be embedded by implementations of Checker to obtain +// no-op implementations of Checker methods that may be explicitly overridden. +type CheckerDefaults struct{} + +// Clone implements Checker.Clone. +func (CheckerDefaults) Clone(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + return nil +} + +// CheckerReq indicates what checkpoints a corresponding Checker runs at, and +// what information it requires at those checkpoints. +type CheckerReq struct { + // Points are the set of checkpoints for which the corresponding Checker + // must be called. Note that methods not specified in Points may still be + // called; implementations of Checker may embed CheckerDefaults to obtain + // no-op implementations of Checker methods. + Points []Point + + // All of the following fields indicate what fields in the corresponding + // XInfo struct will be requested at the corresponding checkpoint. + Clone CloneFields +} + +// Global is the method receiver of all seccheck functions. +var Global state + +// state is the type of global, and is separated out for testing. +type state struct { + // registrationMu serializes all changes to the set of registered Checkers + // for all checkpoints. + registrationMu sync.Mutex + + // enabledPoints is a bitmask of checkpoints for which at least one Checker + // is registered. + // + // enabledPoints is accessed using atomic memory operations. Mutation of + // enabledPoints is serialized by registrationMu. + enabledPoints [numPointBitmaskUint32s]uint32 + + // registrationSeq supports store-free atomic reads of registeredCheckers. + registrationSeq sync.SeqCount + + // checkers is the set of all registered Checkers in order of execution. + // + // checkers is accessed using instantiations of SeqAtomic functions. + // Mutation of checkers is serialized by registrationMu. + checkers []Checker + + // All of the following xReq variables indicate what fields in the + // corresponding XInfo struct have been requested by any registered + // checker, are accessed using atomic memory operations, and are mutated + // with registrationMu locked. + cloneReq CloneFieldSet +} + +// AppendChecker registers the given Checker to execute at checkpoints. The +// Checker will execute after all previously-registered Checkers, and only if +// those Checkers return a nil error. +func (s *state) AppendChecker(c Checker, req *CheckerReq) { + s.registrationMu.Lock() + defer s.registrationMu.Unlock() + s.cloneReq.AddFieldsLoadable(req.Clone) + s.appendCheckerLocked(c) + for _, p := range req.Points { + word, bit := p/32, p%32 + atomic.StoreUint32(&s.enabledPoints[word], s.enabledPoints[word]|(uint32(1)<<bit)) + } +} + +// Enabled returns true if any Checker is registered for the given checkpoint. +func (s *state) Enabled(p Point) bool { + word, bit := p/32, p%32 + return atomic.LoadUint32(&s.enabledPoints[word])&(uint32(1)<<bit) != 0 +} + +func (s *state) getCheckers() []Checker { + return SeqAtomicLoadCheckerSlice(&s.registrationSeq, &s.checkers) +} + +// Preconditions: s.registrationMu must be locked. +func (s *state) appendCheckerLocked(c Checker) { + s.registrationSeq.BeginWrite() + s.checkers = append(s.checkers, c) + s.registrationSeq.EndWrite() +} diff --git a/pkg/sentry/seccheck/seccheck_test.go b/pkg/sentry/seccheck/seccheck_test.go new file mode 100644 index 000000000..687810d18 --- /dev/null +++ b/pkg/sentry/seccheck/seccheck_test.go @@ -0,0 +1,157 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + "errors" + "testing" + + "gvisor.dev/gvisor/pkg/context" +) + +type testChecker struct { + CheckerDefaults + + onClone func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error +} + +// Clone implements Checker.Clone. +func (c *testChecker) Clone(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + if c.onClone == nil { + return nil + } + return c.onClone(ctx, mask, info) +} + +func TestNoChecker(t *testing.T) { + var s state + if s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got true, wanted false") + } +} + +func TestCheckerNotRegisteredForPoint(t *testing.T) { + var s state + s.AppendChecker(&testChecker{}, &CheckerReq{}) + if s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got true, wanted false") + } +} + +func TestCheckerRegistered(t *testing.T) { + var s state + checkerCalled := false + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkerCalled = true + return nil + }}, &CheckerReq{ + Points: []Point{PointClone}, + Clone: CloneFields{ + Credentials: true, + }, + }) + + if !s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got false, wanted true") + } + if !s.CloneReq().Contains(CloneFieldCredentials) { + t.Errorf("CloneReq().Contains(CloneFieldCredentials): got false, wanted true") + } + if err := s.Clone(context.Background(), CloneFieldSet{}, &CloneInfo{}); err != nil { + t.Errorf("Clone(): got %v, wanted nil", err) + } + if !checkerCalled { + t.Errorf("Clone() did not call Checker.Clone()") + } +} + +func TestMultipleCheckersRegistered(t *testing.T) { + var s state + checkersCalled := [2]bool{} + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[0] = true + return nil + }}, &CheckerReq{ + Points: []Point{PointClone}, + Clone: CloneFields{ + Args: true, + }, + }) + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[1] = true + return nil + }}, &CheckerReq{ + Points: []Point{PointClone}, + Clone: CloneFields{ + Created: TaskFields{ + ThreadID: true, + }, + }, + }) + + if !s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got false, wanted true") + } + // CloneReq() should return the union of requested fields from all calls to + // AppendChecker. + req := s.CloneReq() + if !req.Contains(CloneFieldArgs) { + t.Errorf("req.Contains(CloneFieldArgs): got false, wanted true") + } + if !req.Created.Contains(TaskFieldThreadID) { + t.Errorf("req.Created.Contains(TaskFieldThreadID): got false, wanted true") + } + if err := s.Clone(context.Background(), CloneFieldSet{}, &CloneInfo{}); err != nil { + t.Errorf("Clone(): got %v, wanted nil", err) + } + for i := range checkersCalled { + if !checkersCalled[i] { + t.Errorf("Clone() did not call Checker.Clone() index %d", i) + } + } +} + +func TestCheckpointReturnsFirstCheckerError(t *testing.T) { + errFirstChecker := errors.New("first Checker error") + errSecondChecker := errors.New("second Checker error") + + var s state + checkersCalled := [2]bool{} + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[0] = true + return errFirstChecker + }}, &CheckerReq{ + Points: []Point{PointClone}, + }) + s.AppendChecker(&testChecker{onClone: func(ctx context.Context, mask CloneFieldSet, info CloneInfo) error { + checkersCalled[1] = true + return errSecondChecker + }}, &CheckerReq{ + Points: []Point{PointClone}, + }) + + if !s.Enabled(PointClone) { + t.Errorf("Enabled(PointClone): got false, wanted true") + } + if err := s.Clone(context.Background(), CloneFieldSet{}, &CloneInfo{}); err != errFirstChecker { + t.Errorf("Clone(): got %v, wanted %v", err, errFirstChecker) + } + if !checkersCalled[0] { + t.Errorf("Clone() did not call first Checker") + } + if checkersCalled[1] { + t.Errorf("Clone() called second Checker") + } +} diff --git a/pkg/sentry/seccheck/task.go b/pkg/sentry/seccheck/task.go new file mode 100644 index 000000000..1dee33203 --- /dev/null +++ b/pkg/sentry/seccheck/task.go @@ -0,0 +1,39 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccheck + +import ( + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" +) + +// TaskInfo contains information unambiguously identifying a single thread +// and/or its containing process. +// +// +fieldenum Task +type TaskInfo struct { + // ThreadID is the thread's ID in the root PID namespace. + ThreadID int32 + + // ThreadStartTime is the thread's CLOCK_REALTIME start time. + ThreadStartTime ktime.Time + + // ThreadGroupID is the thread's group leader's ID in the root PID + // namespace. + ThreadGroupID int32 + + // ThreadGroupStartTime is the thread's group leader's CLOCK_REALTIME start + // time. + ThreadGroupStartTime ktime.Time +} diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 2029e7cf4..b2fc84181 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/abi/linux", "//pkg/bits", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal", "//pkg/marshal/primitive", @@ -25,7 +26,6 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 235b9c306..00a5e729a 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -28,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) const maxInt = int(^uint(0) >> 1) @@ -70,7 +70,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { file := t.GetFile(fd) if file == nil { files.Release(t) - return nil, syserror.EBADF + return nil, linuxerr.EBADF } files = append(files, file) } @@ -169,7 +169,7 @@ func NewSCMCredentials(t *kernel.Task, cred linux.ControlMessageCredentials) (SC return nil, err } if kernel.ThreadID(cred.PID) != t.ThreadGroup().ID() && !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.PIDNamespace().UserNamespace()) { - return nil, syserror.EPERM + return nil, linuxerr.EPERM } return &scmCredentials{t, kuid, kgid}, nil } @@ -473,17 +473,17 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) for i := 0; i < len(buf); { if i+linux.SizeOfControlMessageHeader > len(buf) { - return cmsgs, syserror.EINVAL + return cmsgs, linuxerr.EINVAL } var h linux.ControlMessageHeader h.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageHeader]) if h.Length < uint64(linux.SizeOfControlMessageHeader) { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } if h.Length > uint64(len(buf)-i) { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } i += linux.SizeOfControlMessageHeader @@ -497,7 +497,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) numRights := rightsSize / linux.SizeOfControlMessageRight if len(fds)+numRights > linux.SCM_MAX_FD { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { @@ -508,7 +508,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } var creds linux.ControlMessageCredentials @@ -522,7 +522,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.SO_TIMESTAMP: if length < linux.SizeOfTimeval { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } var ts linux.Timeval ts.UnmarshalUnsafe(buf[i : i+linux.SizeOfTimeval]) @@ -532,13 +532,13 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) default: // Unknown message type. - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IP: switch h.Type { case linux.IP_TOS: if length < linux.SizeOfControlMessageTOS { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTOS = true var tos primitive.Uint8 @@ -548,7 +548,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IP_PKTINFO: if length < linux.SizeOfControlMessageIPPacketInfo { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasIPPacketInfo = true @@ -561,7 +561,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IP_RECVORIGDSTADDR: var addr linux.SockAddrInet if length < addr.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()]) cmsgs.IP.OriginalDstAddress = &addr @@ -570,7 +570,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IP_RECVERR: var errCmsg linux.SockErrCMsgIPv4 if length < errCmsg.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()]) @@ -578,13 +578,13 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) i += bits.AlignUp(length, width) default: - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IPV6: switch h.Type { case linux.IPV6_TCLASS: if length < linux.SizeOfControlMessageTClass { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTClass = true var tclass primitive.Uint32 @@ -595,7 +595,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IPV6_RECVORIGDSTADDR: var addr linux.SockAddrInet6 if length < addr.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()]) cmsgs.IP.OriginalDstAddress = &addr @@ -604,7 +604,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) case linux.IPV6_RECVERR: var errCmsg linux.SockErrCMsgIPv6 if length < errCmsg.SizeBytes() { - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()]) @@ -612,10 +612,10 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) i += bits.AlignUp(length, width) default: - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } default: - return socket.ControlMessages{}, syserror.EINVAL + return socket.ControlMessages{}, linuxerr.EINVAL } } diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go index 37d02948f..0a989cbeb 100644 --- a/pkg/sentry/socket/control/control_vfs2.go +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -17,10 +17,10 @@ package control import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // SCMRightsVFS2 represents a SCM_RIGHTS socket control message. @@ -51,7 +51,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { file := t.GetFileVFS2(fd) if file == nil { files.Release(t) - return nil, syserror.EBADF + return nil, linuxerr.EBADF } files = append(files, file) } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 3c6511ead..4ea89f9d0 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -18,6 +18,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fdnotifier", "//pkg/hostarch", "//pkg/log", @@ -37,7 +38,6 @@ go_library( "//pkg/sentry/socket/control", "//pkg/sentry/vfs", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/usermem", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 52ae4bc9c..1c1e501ba 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" @@ -34,7 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -67,23 +67,6 @@ type socketOperations struct { socketOpsCommon } -// socketOpsCommon contains the socket operations common to VFS1 and VFS2. -// -// +stateify savable -type socketOpsCommon struct { - socket.SendReceiveTimeout - - family int // Read-only. - stype linux.SockType // Read-only. - protocol int // Read-only. - queue waiter.Queue - - // fd is the host socket fd. It must have O_NONBLOCK, so that operations - // will return EWOULDBLOCK instead of blocking on the host. This allows us to - // handle blocking behavior independently in the sentry. - fd int -} - var _ = socket.Socket(&socketOperations{}) func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) { @@ -103,29 +86,6 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil } -// Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release(context.Context) { - fdnotifier.RemoveFD(int32(s.fd)) - unix.Close(s.fd) -} - -// Readiness implements waiter.Waitable.Readiness. -func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { - return fdnotifier.NonBlockingPoll(int32(s.fd), mask) -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - s.queue.EventRegister(e, mask) - fdnotifier.UpdateFD(int32(s.fd)) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { - s.queue.EventUnregister(e) - fdnotifier.UpdateFD(int32(s.fd)) -} - // Ioctl implements fs.FileOperations.Ioctl. func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { return ioctl(ctx, s.fd, io, args) @@ -177,6 +137,96 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO return int64(n), err } +// Socket implements socket.Provider.Socket. +func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. + stype := stypeflags & linux.SOCK_TYPE_MASK + switch stype { + case unix.SOCK_STREAM: + switch protocol { + case 0, unix.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case unix.SOCK_DGRAM: + switch protocol { + case 0, unix.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := unix.Socket(p.family, int(stype)|unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&unix.SOCK_NONBLOCK != 0) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} + +// LINT.ThenChange(./socket_vfs2.go) + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { + socket.SendReceiveTimeout + + family int // Read-only. + stype linux.SockType // Read-only. + protocol int // Read-only. + queue waiter.Queue + + // fd is the host socket fd. It must have O_NONBLOCK, so that operations + // will return EWOULDBLOCK instead of blocking on the host. This allows us to + // handle blocking behavior independently in the sentry. + fd int +} + +// Release implements fs.FileOperations.Release. +func (s *socketOpsCommon) Release(context.Context) { + fdnotifier.RemoveFD(int32(s.fd)) + unix.Close(s.fd) +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(s.fd), mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(s.fd)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { + s.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(s.fd)) +} + // Connect implements socket.Socket.Connect. func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { if len(sockaddr) > sizeofSockaddr { @@ -237,7 +287,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) if blocking { var ch chan struct{} - for syscallErr == syserror.ErrWouldBlock { + for syscallErr == linuxerr.ErrWouldBlock { if ch != nil { if syscallErr = t.Block(ch); syscallErr != nil { break @@ -484,7 +534,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags n, err := copyToDst() // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT. if flags&(unix.MSG_DONTWAIT|unix.MSG_ERRQUEUE) == 0 { - for err == syserror.ErrWouldBlock { + for err == linuxerr.ErrWouldBlock { // We only expect blocking to come from the actual syscall, in which // case it can't have returned any data. if n != 0 { @@ -596,6 +646,17 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b return 0, syserr.ErrInvalidArgument } + // If the src is zero-length, call SENDTO directly with a null buffer in + // order to generate poll/epoll notifications. + if src.NumBytes() == 0 { + sysflags := flags | unix.MSG_DONTWAIT + n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), 0, 0, uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) + if errno != 0 { + return 0, syserr.FromError(errno) + } + return int(n), nil + } + space := uint64(control.CmsgsSpace(t, controlMessages)) if space > maxControlLen { space = maxControlLen @@ -645,7 +706,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b var ch chan struct{} n, err := src.CopyInTo(t, sendmsgFromBlocks) if flags&unix.MSG_DONTWAIT == 0 { - for err == syserror.ErrWouldBlock { + for err == linuxerr.ErrWouldBlock { // We only expect blocking to come from the actual syscall, in which // case it can't have returned any data. if n != 0 { @@ -653,8 +714,8 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b } if ch != nil { if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -673,7 +734,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b func translateIOSyscallError(err error) error { if err == unix.EAGAIN || err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -709,56 +770,6 @@ type socketProvider struct { family int } -// Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { - // Check that we are using the host network stack. - stack := t.NetworkContext() - if stack == nil { - return nil, nil - } - if _, ok := stack.(*Stack); !ok { - return nil, nil - } - - // Only accept TCP and UDP. - stype := stypeflags & linux.SOCK_TYPE_MASK - switch stype { - case unix.SOCK_STREAM: - switch protocol { - case 0, unix.IPPROTO_TCP: - // ok - default: - return nil, nil - } - case unix.SOCK_DGRAM: - switch protocol { - case 0, unix.IPPROTO_UDP: - // ok - default: - return nil, nil - } - default: - return nil, nil - } - - // Conservatively ignore all flags specified by the application and add - // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 - // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. - fd, err := unix.Socket(p.family, int(stype)|unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC, 0) - if err != nil { - return nil, syserr.FromError(err) - } - return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&unix.SOCK_NONBLOCK != 0) -} - -// Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { - // Not supported by AF_INET/AF_INET6. - return nil, nil, nil -} - -// LINT.ThenChange(./socket_vfs2.go) - func init() { for _, family := range []int{unix.AF_INET, unix.AF_INET6} { socket.RegisterProvider(family, &socketProvider{family}) diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index d3be2d825..587f479eb 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -20,12 +20,12 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -67,9 +67,25 @@ func ioctl(ctx context.Context, fd int, io usermem.IO, args arch.SyscallArgument AddressSpaceActive: true, }) return 0, err - + case unix.SIOCGIFFLAGS, unix.SIOCGIFCONF: + cc := &usermem.IOCopyContext{ + Ctx: ctx, + IO: io, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + var ifr linux.IFReq + if _, err := ifr.CopyIn(cc, args[2].Pointer()); err != nil { + return 0, err + } + if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&ifr))); errno != 0 { + return 0, translateIOSyscallError(errno) + } + _, err := ifr.CopyOut(cc, args[2].Pointer()) + return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 5d55cc64d..cd6e34ecc 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -18,6 +18,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -105,7 +105,7 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PRead implements vfs.FileDescriptionImpl.PRead. func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. @@ -113,7 +113,7 @@ func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) @@ -124,7 +124,7 @@ func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // PWrite implements vfs.FileDescriptionImpl. func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. @@ -132,7 +132,7 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go index 8a783712e..2397e04e7 100644 --- a/pkg/sentry/socket/hostinet/sockopt_impl.go +++ b/pkg/sentry/socket/hostinet/sockopt_impl.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package hostinet import ( diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index cbb1e905d..61111ac6c 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -29,11 +29,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" @@ -309,6 +309,11 @@ func (s *Stack) Interfaces() map[int32]inet.Interface { return interfaces } +// RemoveInterface implements inet.Stack.RemoveInterface. +func (*Stack) RemoveInterface(int32) error { + return linuxerr.EACCES +} + // InterfaceAddrs implements inet.Stack.InterfaceAddrs. func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { addrs := make(map[int32][]inet.InterfaceAddr) @@ -319,13 +324,13 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { } // AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. -func (s *Stack) AddInterfaceAddr(int32, inet.InterfaceAddr) error { - return syserror.EACCES +func (*Stack) AddInterfaceAddr(int32, inet.InterfaceAddr) error { + return linuxerr.EACCES } // RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. -func (s *Stack) RemoveInterfaceAddr(int32, inet.InterfaceAddr) error { - return syserror.EACCES +func (*Stack) RemoveInterfaceAddr(int32, inet.InterfaceAddr) error { + return linuxerr.EACCES } // SupportsIPv6 implements inet.Stack.SupportsIPv6. @@ -339,8 +344,8 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { } // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. -func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { - return syserror.EACCES +func (*Stack) SetTCPReceiveBufferSize(inet.TCPBufferSize) error { + return linuxerr.EACCES } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. @@ -349,8 +354,8 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { } // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. -func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { - return syserror.EACCES +func (*Stack) SetTCPSendBufferSize(inet.TCPBufferSize) error { + return linuxerr.EACCES } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. @@ -359,8 +364,8 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. -func (s *Stack) SetTCPSACKEnabled(bool) error { - return syserror.EACCES +func (*Stack) SetTCPSACKEnabled(bool) error { + return linuxerr.EACCES } // TCPRecovery implements inet.Stack.TCPRecovery. @@ -369,8 +374,8 @@ func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { } // SetTCPRecovery implements inet.Stack.SetTCPRecovery. -func (s *Stack) SetTCPRecovery(inet.TCPLossRecovery) error { - return syserror.EACCES +func (*Stack) SetTCPRecovery(inet.TCPLossRecovery) error { + return linuxerr.EACCES } // getLine reads one line from proc file, with specified prefix. @@ -470,20 +475,20 @@ func (s *Stack) RouteTable() []inet.Route { } // Resume implements inet.Stack.Resume. -func (s *Stack) Resume() {} +func (*Stack) Resume() {} // RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. -func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } +func (*Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } // CleanupEndpoints implements inet.Stack.CleanupEndpoints. -func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } +func (*Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. -func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} +func (*Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} // SetForwarding implements inet.Stack.SetForwarding. -func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error { - return syserror.EACCES +func (*Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error { + return linuxerr.EACCES } // PortRange implements inet.Stack.PortRange. @@ -493,6 +498,6 @@ func (*Stack) PortRange() (uint16, uint16) { } // SetPortRange implements inet.Stack.SetPortRange. -func (*Stack) SetPortRange(start uint16, end uint16) error { - return syserror.EACCES +func (*Stack) SetPortRange(uint16, uint16) error { + return linuxerr.EACCES } diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index 61b2c9755..608474fa1 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -25,6 +25,7 @@ go_library( "//pkg/log", "//pkg/marshal", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/header", diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go index 6fc7781ad..3f1b4a17b 100644 --- a/pkg/sentry/socket/netfilter/extensions.go +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -19,20 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) -// TODO(gvisor.dev/issue/170): The following per-matcher params should be -// supported: -// - Table name -// - Match size -// - User size -// - Hooks -// - Proto -// - Family - // matchMaker knows how to (un)marshal the matcher named name(). type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. @@ -43,7 +35,7 @@ type matchMaker interface { // unmarshal converts from the ABI matcher struct to an // stack.Matcher. - unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) + unmarshal(task *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) } type matcher interface { @@ -94,12 +86,12 @@ func marshalEntryMatch(name string, data []byte) []byte { return buf } -func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { +func unmarshalMatcher(task *kernel.Task, match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { matchMaker, ok := matchMakers[match.Name.String()] if !ok { return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) } - return matchMaker.unmarshal(buf, filter) + return matchMaker.unmarshal(task, buf, filter) } // targetMaker knows how to (un)marshal a target. Once registered, diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go index cb78ef60b..af31cbc5b 100644 --- a/pkg/sentry/socket/netfilter/ipv4.go +++ b/pkg/sentry/socket/netfilter/ipv4.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -80,6 +81,8 @@ func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTG copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask) copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + copy(entry.Entry.IP.InputInterface[:], rule.Filter.InputInterface) + copy(entry.Entry.IP.InputInterfaceMask[:], rule.Filter.InputInterfaceMask) if rule.Filter.DstInvert { entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP } @@ -123,7 +126,7 @@ func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTG return entries, info } -func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { +func modifyEntries4(task *kernel.Task, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. @@ -148,23 +151,19 @@ func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): We should support more IPTIP - // filtering fields. filter, err := filterFromIPTIP(entry.IP) if err != nil { nflog("bad iptip: %v", err) return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Matchers and targets can specify - // that they only work for certain protocols, hooks, tables. // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry if len(optVal) < int(matchersSize) { nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } - matchers, err := parseMatchers(filter, optVal[:matchersSize]) + matchers, err := parseMatchers(task, filter, optVal[:matchersSize]) if err != nil { nflog("failed to parse matchers: %v", err) return nil, syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go index 5cb7fe4aa..6cefe0b9c 100644 --- a/pkg/sentry/socket/netfilter/ipv6.go +++ b/pkg/sentry/socket/netfilter/ipv6.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -80,6 +81,8 @@ func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6T copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask) copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + copy(entry.Entry.IPv6.InputInterface[:], rule.Filter.InputInterface) + copy(entry.Entry.IPv6.InputInterfaceMask[:], rule.Filter.InputInterfaceMask) if rule.Filter.DstInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP } @@ -126,7 +129,7 @@ func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6T return entries, info } -func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { +func modifyEntries6(task *kernel.Task, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. @@ -151,23 +154,19 @@ func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): We should support more IPTIP - // filtering fields. filter, err := filterFromIP6TIP(entry.IPv6) if err != nil { nflog("bad iptip: %v", err) return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Matchers and targets can specify - // that they only work for certain protocols, hooks, tables. // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry if len(optVal) < int(matchersSize) { nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } - matchers, err := parseMatchers(filter, optVal[:matchersSize]) + matchers, err := parseMatchers(task, filter, optVal[:matchersSize]) if err != nil { nflog("failed to parse matchers: %v", err) return nil, syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index e1c4b06fc..e3eade180 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -174,13 +174,12 @@ func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. -func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { +func SetEntries(task *kernel.Task, stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { var replace linux.IPTReplace replaceBuf := optVal[:linux.SizeOfIPTReplace] optVal = optVal[linux.SizeOfIPTReplace:] replace.UnmarshalBytes(replaceBuf) - // TODO(gvisor.dev/issue/170): Support other tables. var table stack.Table switch replace.Name.String() { case filterTable: @@ -188,16 +187,16 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { case natTable: table = stack.EmptyNATTable() default: - nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) + nflog("unknown iptables table %q", replace.Name.String()) return syserr.ErrInvalidArgument } var err *syserr.Error var offsets map[uint32]int if ipv6 { - offsets, err = modifyEntries6(stk, optVal, &replace, &table) + offsets, err = modifyEntries6(task, stk, optVal, &replace, &table) } else { - offsets, err = modifyEntries4(stk, optVal, &replace, &table) + offsets, err = modifyEntries4(task, stk, optVal, &replace, &table) } if err != nil { return err @@ -272,7 +271,6 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { table.Rules[ruleIdx] = rule } - // TODO(gvisor.dev/issue/170): Support other chains. // Since we don't support FORWARD, yet, make sure all other chains point to // ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { @@ -287,7 +285,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { } } - // TODO(gvisor.dev/issue/170): Check the following conditions: + // TODO(gvisor.dev/issue/6167): Check the following conditions: // - There are no loops. // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. @@ -297,7 +295,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { // parseMatchers parses 0 or more matchers from optVal. optVal should contain // only the matchers. -func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) { +func parseMatchers(task *kernel.Task, filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) { nflog("set entries: parsing matchers of size %d", len(optVal)) var matchers []stack.Matcher for len(optVal) > 0 { @@ -321,13 +319,13 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, } // Parse the specific matcher. - matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) + matcher, err := unmarshalMatcher(task, match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) if err != nil { return nil, fmt.Errorf("failed to create matcher: %v", err) } matchers = append(matchers, matcher) - // TODO(gvisor.dev/issue/170): Check the revision field. + // TODO(gvisor.dev/issue/6167): Check the revision field. optVal = optVal[match.MatchSize:] } diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go index 60845cab3..6eff2ae65 100644 --- a/pkg/sentry/socket/netfilter/owner_matcher.go +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -40,8 +42,8 @@ func (ownerMarshaler) name() string { func (ownerMarshaler) marshal(mr matcher) []byte { matcher := mr.(*OwnerMatcher) iptOwnerInfo := linux.IPTOwnerInfo{ - UID: matcher.uid, - GID: matcher.gid, + UID: uint32(matcher.uid), + GID: uint32(matcher.gid), } // Support for UID and GID match. @@ -63,7 +65,7 @@ func (ownerMarshaler) marshal(mr matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { +func (ownerMarshaler) unmarshal(task *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfIPTOwnerInfo { return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf)) } @@ -72,11 +74,12 @@ func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack. // exceed what's strictly necessary to hold matchData. var matchData linux.IPTOwnerInfo matchData.UnmarshalUnsafe(buf[:linux.SizeOfIPTOwnerInfo]) - nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData) + nflog("parsed IPTOwnerInfo: %+v", matchData) var owner OwnerMatcher - owner.uid = matchData.UID - owner.gid = matchData.GID + creds := task.Credentials() + owner.uid = creds.UserNamespace.MapToKUID(auth.UID(matchData.UID)) + owner.gid = creds.UserNamespace.MapToKGID(auth.GID(matchData.GID)) // Check flags. if matchData.Match&linux.XT_OWNER_UID != 0 { @@ -97,8 +100,8 @@ func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack. // OwnerMatcher matches against a UID and/or GID. type OwnerMatcher struct { - uid uint32 - gid uint32 + uid auth.KUID + gid auth.KGID matchUID bool matchGID bool invertUID bool @@ -113,7 +116,6 @@ func (*OwnerMatcher) name() string { // Match implements Matcher.Match. func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // Support only for OUTPUT chain. - // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. if hook != stack.Output { return false, true } @@ -126,7 +128,7 @@ func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ str var matches bool // Check for UID match. if om.matchUID { - if pkt.Owner.UID() == om.uid { + if auth.KUID(pkt.Owner.KUID()) == om.uid { matches = true } if matches == om.invertUID { @@ -137,7 +139,7 @@ func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ str // Check for GID match. if om.matchGID { matches = false - if pkt.Owner.GID() == om.gid { + if auth.KGID(pkt.Owner.KGID()) == om.gid { matches = true } if matches == om.invertGID { diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go index fa5456eee..ea56f39c1 100644 --- a/pkg/sentry/socket/netfilter/targets.go +++ b/pkg/sentry/socket/netfilter/targets.go @@ -331,7 +331,6 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) ( return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Check if the flags are valid. // Also check if we need to map ports or IP. // For now, redirect target only supports destination port change. // Port range and IP range are not supported yet. @@ -340,7 +339,6 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) ( return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Port range is not supported yet. if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument @@ -420,7 +418,6 @@ func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (tar return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/3549): Check for other flags. // For now, redirect target only supports destination change. if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED { nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags) @@ -502,7 +499,6 @@ func (*snatTargetMakerV4) unmarshal(buf []byte, filter stack.IPHeaderFilter) (ta return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Port range is not supported yet. if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("snatTargetMakerV4: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument @@ -594,7 +590,6 @@ func (*snatTargetMakerV6) unmarshal(buf []byte, filter stack.IPHeaderFilter) (ta // translateToStandardTarget translates from the value in a // linux.XTStandardTarget to an stack.Verdict. func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (target, *syserr.Error) { - // TODO(gvisor.dev/issue/170): Support other verdicts. switch val { case -linux.NF_ACCEPT - 1: return &acceptTarget{stack.AcceptTarget{ diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 95bb9826e..e5b73a976 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -50,7 +51,7 @@ func (tcpMarshaler) marshal(mr matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { +func (tcpMarshaler) unmarshal(_ *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTTCP { return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) } @@ -95,8 +96,6 @@ func (*TCPMatcher) name() string { // Match implements Matcher.Match. func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved - // into the stack.Check codepath as matchers are added. switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: netHeader := header.IPv4(pkt.NetworkHeader().View()) diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index fb8be27e6..aa72ee70c 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -50,7 +51,7 @@ func (udpMarshaler) marshal(mr matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { +func (udpMarshaler) unmarshal(_ *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTUDP { return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) } @@ -92,8 +93,6 @@ func (*UDPMatcher) name() string { // Match implements Matcher.Match. func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved - // into the stack.Check codepath as matchers are added. switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: netHeader := header.IPv4(pkt.NetworkHeader().View()) diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 64cd263da..9710a15ee 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -14,8 +14,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/abi/linux/errno", "//pkg/bits", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal", "//pkg/marshal/primitive", @@ -34,7 +36,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 86f6419dc..d526acb73 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -161,6 +161,47 @@ func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlin return nil } +// delLink handles RTM_DELLINK requests. +func (p *Protocol) delLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network stack. + return syserr.ErrProtocolNotSupported + } + + var ifinfomsg linux.InterfaceInfoMessage + attrs, ok := msg.GetData(&ifinfomsg) + if !ok { + return syserr.ErrInvalidArgument + } + if ifinfomsg.Index == 0 { + // The index is unspecified, search by the interface name. + ahdr, value, _, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument + } + switch ahdr.Type { + case linux.IFLA_IFNAME: + if len(value) < 1 { + return syserr.ErrInvalidArgument + } + ifname := string(value[:len(value)-1]) + for idx, ifa := range stack.Interfaces() { + if ifname == ifa.Name { + ifinfomsg.Index = idx + break + } + } + default: + return syserr.ErrInvalidArgument + } + if ifinfomsg.Index == 0 { + return syserr.ErrNoDevice + } + } + return syserr.FromError(stack.RemoveInterface(ifinfomsg.Index)) +} + // addNewLinkMessage appends RTM_NEWLINK message for the given interface into // the message set. func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) { @@ -537,6 +578,8 @@ func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms switch hdr.Type { case linux.RTM_GETLINK: return p.getLink(ctx, msg, ms) + case linux.RTM_DELLINK: + return p.delLink(ctx, msg, ms) case linux.RTM_GETROUTE: return p.dumpRoutes(ctx, msg, ms) case linux.RTM_NEWADDR: diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 280563d09..ed5fa9c38 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -20,7 +20,9 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -37,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -56,7 +57,7 @@ const ( maxSendBufferSize = 4 << 20 // 4MB ) -var errNoFilter = syserr.New("no filter attached", linux.ENOENT) +var errNoFilter = syserr.New("no filter attached", errno.ENOENT) // netlinkSocketDevice is the netlink socket virtual device. var netlinkSocketDevice = device.NewAnonDevice() @@ -212,7 +213,7 @@ func (s *socketOpsCommon) ConnectedPasscred() bool { // Ioctl implements fs.FileOperations.Ioctl. func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // ExtractSockAddr extracts the SockAddrNetlink from b. @@ -528,7 +529,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } - if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if n, err := doRead(); err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { var mflags int if n < int64(r.MsgSize) { mflags |= linux.MSG_TRUNC @@ -546,7 +547,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags defer s.EventUnregister(&e) for { - if n, err := doRead(); err != syserror.ErrWouldBlock { + if n, err := doRead(); err != linuxerr.ErrWouldBlock { var mflags int if n < int64(r.MsgSize) { mflags |= linux.MSG_TRUNC @@ -558,7 +559,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index 842036764..4d3cdea62 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -17,6 +17,7 @@ package netlink import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -108,12 +108,12 @@ func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { // Ioctl implements vfs.FileDescriptionImpl. func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // PRead implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. @@ -121,7 +121,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -134,7 +134,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // PWrite implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. @@ -142,7 +142,7 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 9561b7c25..bf5ec4558 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -19,7 +19,9 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/abi/linux/errno", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/marshal", @@ -40,13 +42,13 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/header", "//pkg/tcpip/link/tun", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/usermem", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index d4b1bad67..aa081e90d 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -36,7 +36,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" @@ -47,18 +49,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" - "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -271,6 +273,7 @@ var Metrics = tcpip.Stats{ Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), + SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), @@ -289,7 +292,7 @@ const DefaultTTL = 64 const sizeOfInt32 int = 4 -var errStackType = syserr.New("expected but did not receive a netstack.Stack", linux.EINVAL) +var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL) // commonEndpoint represents the intersection of a tcpip.Endpoint and a // transport.Endpoint. @@ -416,6 +419,27 @@ func bytesToIPAddress(addr []byte) tcpip.Address { return tcpip.Address(addr) } +// minSockAddrLen returns the minimum length in bytes of a socket address for +// the socket's family. +func (s *socketOpsCommon) minSockAddrLen() int { + const addressFamilySize = 2 + + switch s.family { + case linux.AF_UNIX: + return addressFamilySize + case linux.AF_INET: + return sockAddrInetSize + case linux.AF_INET6: + return sockAddrInet6Size + case linux.AF_PACKET: + return sockAddrLinkSize + case linux.AF_UNSPEC: + return addressFamilySize + default: + panic(fmt.Sprintf("s.family unrecognized = %d", s.family)) + } +} + func (s *socketOpsCommon) isPacketBased() bool { return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } @@ -455,7 +479,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { - return int64(n), syserror.ErrWouldBlock + return int64(n), linuxerr.ErrWouldBlock } if err != nil { return 0, err.ToError() @@ -488,14 +512,14 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO r := src.Reader(ctx) n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) if _, ok := err.(*tcpip.ErrWouldBlock); ok { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } if n < src.NumBytes() { - return n, syserror.ErrWouldBlock + return n, linuxerr.ErrWouldBlock } return n, nil @@ -542,16 +566,21 @@ func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return s.Endpoint.Readiness(mask) } -func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error { +// checkFamily returns true iff the specified address family may be used with +// the socket. +// +// If exact is true, then the specified address family must be an exact match +// with the socket's family. +func (s *socketOpsCommon) checkFamily(family uint16, exact bool) bool { if family == uint16(s.family) { - return nil + return true } if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { if !s.Endpoint.SocketOptions().GetV6Only() { - return nil + return true } } - return syserr.ErrInvalidArgument + return false } // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the @@ -584,8 +613,8 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool return syserr.TranslateNetstackError(err) } - if err := s.checkFamily(family, false /* exact */); err != nil { - return err + if !s.checkFamily(family, false /* exact */) { + return syserr.ErrInvalidArgument } addr = s.mapFamily(addr, family) @@ -643,23 +672,24 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } a.UnmarshalBytes(sockaddr[:sockAddrLinkSize]) - if a.Protocol != uint16(s.protocol) { - return syserr.ErrInvalidArgument - } - addr = tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + Port: socket.Ntohs(a.Protocol), } } else { + if s.minSockAddrLen() > len(sockaddr) { + return syserr.ErrInvalidArgument + } + var err *syserr.Error addr, family, err = socket.AddressAndFamily(sockaddr) if err != nil { return err } - if err = s.checkFamily(family, true /* exact */); err != nil { - return err + if !s.checkFamily(family, true /* exact */) { + return syserr.ErrAddressFamilyNotSupported } addr = s.mapFamily(addr, family) @@ -1680,6 +1710,26 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int return nil } +func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 { + // packetOverheadFactor is used to multiply the value provided by the user on + // a setsockopt(2) for setting the send/receive buffer sizes sockets. + const packetOverheadFactor = 2 + + if !ignoreMax && newSz > max { + newSz = max + } + + if newSz < math.MaxInt32/packetOverheadFactor { + newSz *= packetOverheadFactor + if newSz < min { + newSz = min + } + } else { + newSz = math.MaxInt32 + } + return newSz +} + // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { @@ -1689,7 +1739,9 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := hostarch.ByteOrder.Uint32(optVal) - ep.SocketOptions().SetSendBufferSize(int64(v), true /* notify */) + min, max := ep.SocketOptions().SendBufferLimits() + clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) + ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) return nil case linux.SO_RCVBUF: @@ -1698,7 +1750,24 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := hostarch.ByteOrder.Uint32(optVal) - ep.SocketOptions().SetReceiveBufferSize(int64(v), true /* notify */) + min, max := ep.SocketOptions().ReceiveBufferLimits() + clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) + ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) + return nil + + case linux.SO_RCVBUFFORCE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) { + return syserr.ErrNotPermitted + } + + v := hostarch.ByteOrder.Uint32(optVal) + min, max := ep.SocketOptions().ReceiveBufferLimits() + clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */) + ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) return nil case linux.SO_REUSEADDR: @@ -2003,7 +2072,7 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name if isTCPSocket(skType, skProto) && tcp.EndpointState(ep.State()) != tcp.StateInitial { return syserr.ErrInvalidEndpointState - } else if isUDPSocket(skType, skProto) && udp.EndpointState(ep.State()) != udp.StateInitial { + } else if isUDPSocket(skType, skProto) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial { return syserr.ErrInvalidEndpointState } @@ -2105,10 +2174,10 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal, true) + return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, true) case linux.IP6T_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. + log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") return nil default: @@ -2348,10 +2417,10 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal, false) + return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, false) case linux.IPT_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. + log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") return nil case linux.IP_ADD_SOURCE_MEMBERSHIP, @@ -2808,7 +2877,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if n > 0 { return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil } - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) @@ -2830,8 +2899,8 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if err != nil { return 0, err } - if err := s.checkFamily(family, false /* exact */); err != nil { - return 0, err + if !s.checkFamily(family, false /* exact */) { + return 0, syserr.ErrInvalidArgument } addrBuf = s.mapFamily(addrBuf, family) @@ -2876,7 +2945,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b // became available between when we last checked and when we setup // the notification. if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return int(total), syserr.ErrTryAgain } // handleIOError will consume errors from t.Block if needed. @@ -2908,7 +2977,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy s.readMu.Lock() defer s.readMu.Unlock() if !s.timestampValid { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } tv := linux.NsecToTimeval(s.timestampNS) @@ -3014,7 +3083,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc unimpl.EmitUnimplementedEvent(ctx) } - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // interfaceIoctl implements interface requests. @@ -3289,10 +3358,10 @@ func (s *socketOpsCommon) State() uint32 { } case isUDPSocket(s.skType, s.protocol): // UDP socket. - switch udp.EndpointState(s.Endpoint.State()) { - case udp.StateInitial, udp.StateBound, udp.StateClosed: + switch transport.DatagramEndpointState(s.Endpoint.State()) { + case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed: return linux.TCP_CLOSE - case udp.StateConnected: + case transport.DatagramEndpointStateConnected: return linux.TCP_ESTABLISHED default: return 0 diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 30f3ad153..3cdf29b80 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -17,6 +17,7 @@ package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -104,7 +104,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -112,7 +112,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. } n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { - return int64(n), syserror.ErrWouldBlock + return int64(n), linuxerr.ErrWouldBlock } if err != nil { return 0, err.ToError() @@ -125,20 +125,20 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } r := src.Reader(ctx) n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) if _, ok := err.(*tcpip.ErrWouldBlock); ok { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } if n < src.NumBytes() { - return n, syserror.ErrWouldBlock + return n, linuxerr.ErrWouldBlock } return n, nil diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index eef5e6519..ea199f223 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -18,10 +18,10 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -71,6 +71,12 @@ func (s *Stack) Interfaces() map[int32]inet.Interface { return is } +// RemoveInterface implements inet.Stack.RemoveInterface. +func (s *Stack) RemoveInterface(idx int32) error { + nic := tcpip.NICID(idx) + return syserr.TranslateNetstackError(s.Stack.RemoveNIC(nic)).ToError() +} + // InterfaceAddrs implements inet.Stack.InterfaceAddrs. func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { nicAddrs := make(map[int32][]inet.InterfaceAddr) @@ -110,24 +116,24 @@ func convertAddr(addr inet.InterfaceAddr) (tcpip.ProtocolAddress, error) { switch addr.Family { case linux.AF_INET: if len(addr.Addr) != header.IPv4AddressSize { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv4AddressSize*8 { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } protocol = ipv4.ProtocolNumber address = tcpip.Address(addr.Addr) case linux.AF_INET6: if len(addr.Addr) != header.IPv6AddressSize { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv6AddressSize*8 { - return protocolAddress, syserror.EINVAL + return protocolAddress, linuxerr.EINVAL } protocol = ipv6.ProtocolNumber address = tcpip.Address(addr.Addr) default: - return protocolAddress, syserror.ENOTSUP + return protocolAddress, linuxerr.ENOTSUP } protocolAddress = tcpip.ProtocolAddress{ @@ -149,7 +155,7 @@ func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { // Attach address to interface. nicID := tcpip.NICID(idx) - if err := s.Stack.AddProtocolAddressWithOptions(nicID, protocolAddress, stack.CanBePrimaryEndpoint); err != nil { + if err := s.Stack.AddProtocolAddress(nicID, protocolAddress, stack.AddressProperties{}); err != nil { return syserr.TranslateNetstackError(err).ToError() } diff --git a/pkg/sentry/socket/netstack/tun.go b/pkg/sentry/socket/netstack/tun.go index c7ed52702..e67fe9700 100644 --- a/pkg/sentry/socket/netstack/tun.go +++ b/pkg/sentry/socket/netstack/tun.go @@ -16,7 +16,7 @@ package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/tcpip/link/tun" ) @@ -41,7 +41,7 @@ func LinuxToTUNFlags(flags uint16) (tun.Flags, error) { // when there is no sk_filter. See __tun_chr_ioctl() in // net/drivers/tun.c. if flags&^uint16(linux.IFF_TUN|linux.IFF_TAP|linux.IFF_NO_PI|linux.IFF_ONE_QUEUE) != 0 { - return tun.Flags{}, syserror.EINVAL + return tun.Flags{}, linuxerr.EINVAL } return tun.Flags{ TUN: flags&linux.IFF_TUN != 0, diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 353f4ade0..841d5bd55 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -509,7 +509,6 @@ func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { linux.SO_ATTACH_REUSEPORT_EBPF, linux.SO_CNX_ADVICE, linux.SO_DETACH_FILTER, - linux.SO_RCVBUFFORCE, linux.SO_SNDBUFFORCE: t.Kernel().EmitUnimplementedEvent(t) @@ -659,7 +658,6 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) return &out, uint32(sockAddrInet6Size) case linux.AF_PACKET: - // TODO(gvisor.dev/issue/173): Return protocol too. var out linux.SockAddrLink out.Family = linux.AF_PACKET out.InterfaceIndex = int32(addr.NIC) @@ -745,14 +743,16 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr[:sockAddrLinkSize]) + // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have + // an ethernet address. if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/173): Return protocol too. return tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + Port: Ntohs(a.Protocol), }, family, nil case linux.AF_UNSPEC: diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index c9cbefb3a..7b546c04d 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -39,6 +39,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/hostarch", "//pkg/log", @@ -61,7 +62,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 33f9aeb06..b3f0cf563 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -129,9 +129,9 @@ func newConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProv stype: stype, } + ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) - ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) return ep } @@ -406,14 +406,15 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error { // Accept accepts a new connection. func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) { e.Lock() - defer e.Unlock() if !e.Listening() { + e.Unlock() return nil, syserr.ErrInvalidEndpointState } select { case ne := <-e.acceptedChan: + e.Unlock() if peerAddr != nil { ne.Lock() c := ne.connected @@ -429,6 +430,7 @@ func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *s return ne, nil default: + e.Unlock() // Nothing left. return nil, syserr.ErrWouldBlock } @@ -517,3 +519,6 @@ func (e *connectionedEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { } return v } + +// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. +func (e *connectionedEndpoint) WakeupWriters() {} diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 61338728a..61311718e 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -44,9 +44,9 @@ func NewConnectionless(ctx context.Context) Endpoint { q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: defaultBufferSize} q.InitRefs() ep.receiver = &queueReceiver{readQueue: &q} + ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) - ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) return ep } @@ -227,3 +227,6 @@ func (e *connectionlessEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { } return v } + +// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. +func (e *connectionlessEndpoint) WakeupWriters() {} diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index e4de44498..a9cedcf5f 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -133,7 +133,7 @@ func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, f free := q.limit - q.used if l > free && truncate { - if free == 0 { + if free <= 0 { // Message can't fit right now. q.mu.Unlock() return 0, false, syserr.ErrWouldBlock diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index db7b1affe..e9e482017 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -23,6 +23,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" @@ -37,7 +38,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -493,7 +493,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b } n, err := src.CopyInTo(t, &w) - if err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return int(n), syserr.FromError(err) } @@ -513,13 +513,13 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b n, err = src.CopyInTo(t, &w) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -647,7 +647,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } var total int64 - if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait { + if n, err := doRead(); err != linuxerr.ErrWouldBlock || dontWait { var from linux.SockAddr var fromLen uint32 if r.From != nil && len([]byte(r.From.Addr)) != 0 { @@ -682,7 +682,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags defer s.EventUnregister(&e) for { - if n, err := doRead(); err != syserror.ErrWouldBlock { + if n, err := doRead(); err != linuxerr.ErrWouldBlock { var from linux.SockAddr var fromLen uint32 if r.From != nil { @@ -719,7 +719,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if total > 0 { err = nil } - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.FromError(err) diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index c39e317ff..8c5075a1c 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -17,6 +17,7 @@ package unix import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" @@ -29,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -236,7 +236,7 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { Mode: linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()), Endpoint: bep, }) - if err == syserror.EEXIST { + if linuxerr.Equals(linuxerr.EEXIST, err) { return syserr.ErrAddressInUse } return syserr.FromError(err) @@ -253,7 +253,7 @@ func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PRead implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. @@ -261,7 +261,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { @@ -282,7 +282,7 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. // PWrite implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. @@ -290,7 +290,7 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } t := kernel.TaskFromContext(ctx) diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 3e801182c..7f02807c5 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -13,6 +13,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -20,7 +21,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/state/statefile", - "//pkg/syserror", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 167754537..e9d544f3d 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -20,6 +20,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -27,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/state/statefile" - "gvisor.dev/gvisor/pkg/syserror" ) var previousMetadata map[string]string @@ -88,7 +88,7 @@ func (opts SaveOpts) Save(ctx context.Context, k *kernel.Kernel, w *watchdog.Wat // ENOSPC is a state file error. This error can only come from // writing the state file, and not from fs.FileOperations.Fsync // because we wrap those in kernel.TaskSet.flushWritesToFiles. - if err == syserror.ENOSPC { + if linuxerr.Equals(linuxerr.ENOSPC, err) { err = ErrStateFile{err} } @@ -110,7 +110,7 @@ type LoadOpts struct { } // Load loads the given kernel, setting the provided platform and stack. -func (opts LoadOpts) Load(ctx context.Context, k *kernel.Kernel, n inet.Stack, clocks time.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { +func (opts LoadOpts) Load(ctx context.Context, k *kernel.Kernel, timeReady chan struct{}, n inet.Stack, clocks time.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { // Open the file. r, m, err := statefile.NewReader(opts.Source, opts.Key) if err != nil { @@ -120,5 +120,5 @@ func (opts LoadOpts) Load(ctx context.Context, k *kernel.Kernel, n inet.Stack, c previousMetadata = m // Restore the Kernel object graph. - return k.LoadFrom(ctx, r, n, clocks, vfsOpts) + return k.LoadFrom(ctx, r, timeReady, n, clocks, vfsOpts) } diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index cefd20b9b..c42297c80 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package state import ( diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go index 6ce1bb592..317c3c31c 100644 --- a/pkg/sentry/strace/linux64_amd64.go +++ b/pkg/sentry/strace/linux64_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package strace diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go index ce5594301..65f27c810 100644 --- a/pkg/sentry/strace/linux64_arm64.go +++ b/pkg/sentry/strace/linux64_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package strace diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index af7088847..757ff2a40 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -133,6 +133,9 @@ func dump(t *kernel.Task, addr hostarch.Addr, size uint, maximumBlobSize uint) s } func path(t *kernel.Task, addr hostarch.Addr) string { + if addr == 0 { + return "<null>" + } path, err := t.CopyInString(addr, linux.PATH_MAX) if err != nil { return fmt.Sprintf("%#x (error decoding path: %s)", addr, err) @@ -816,10 +819,10 @@ func convertToSyscallFlag(sinks SinkType) uint32 { return ret } -// Enable enables the syscalls in whitelist in all syscall tables. +// Enable enables the syscalls in allowlist in all syscall tables. // // Preconditions: Initialize has been called. -func Enable(whitelist []string, sinks SinkType) error { +func Enable(allowlist []string, sinks SinkType) error { flags := convertToSyscallFlag(sinks) for _, table := range kernel.SyscallTables() { // Is this known? @@ -829,7 +832,7 @@ func Enable(whitelist []string, sinks SinkType) error { } // Convert to a set of system calls numbers. - wl, err := sys.ConvertToSysnoMap(whitelist) + wl, err := sys.ConvertToSysnoMap(allowlist) if err != nil { return err } diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index b8d1bd415..7a7c80ac6 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -11,11 +11,11 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/errors/linuxerr", "//pkg/sentry/arch", "//pkg/sentry/kernel", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/time", - "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index 3b4d79889..a69ed0746 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -18,10 +18,10 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -45,21 +45,21 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syserror.EBADF + return linuxerr.EBADF } defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syserror.EBADF + return linuxerr.EBADF } // Try to add the entry. @@ -71,21 +71,21 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syserror.EBADF + return linuxerr.EBADF } defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syserror.EBADF + return linuxerr.EBADF } // Try to update the entry. @@ -97,21 +97,21 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { // Get epoll from the file descriptor. epollfile := t.GetFile(epfd) if epollfile == nil { - return syserror.EBADF + return linuxerr.EBADF } defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return syserror.EBADF + return linuxerr.EBADF } // Try to remove the entry. @@ -123,14 +123,14 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeoutInNanos int64) ([]linux // Get epoll from the file descriptor. epollfile := t.GetFile(fd) if epollfile == nil { - return nil, syserror.EBADF + return nil, linuxerr.EBADF } defer epollfile.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) if !ok { - return nil, syserror.EBADF + return nil, linuxerr.EBADF } // Try to read events and return right away if we got them or if the @@ -163,7 +163,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeoutInNanos int64) ([]linux } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return nil, nil } diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 408a6c422..394396cde 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -25,6 +25,7 @@ go_library( "sys_mempolicy.go", "sys_mmap.go", "sys_mount.go", + "sys_msgqueue.go", "sys_pipe.go", "sys_poll.go", "sys_prctl.go", @@ -64,6 +65,7 @@ go_library( "//pkg/abi/linux", "//pkg/bpf", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/marshal", @@ -83,6 +85,8 @@ go_library( "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/eventfd", "//pkg/sentry/kernel/fasync", + "//pkg/sentry/kernel/ipc", + "//pkg/sentry/kernel/msgqueue", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/shm", @@ -100,7 +104,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 6eabfd219..f4d549a3f 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -19,13 +19,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) var ( @@ -89,18 +89,18 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr er } // Translate error, if possible, to consolidate errors from other packages - // into a smaller set of errors from syserror package. + // into a smaller set of errors from linuxerr package. translatedErr := errOrig - if errno, ok := syserror.TranslateError(errOrig); ok { + if errno, ok := linuxerr.TranslateError(errOrig); ok { translatedErr = errno } - switch translatedErr { - case io.EOF: + switch { + case translatedErr == io.EOF: // EOF is always consumed. If this is a partial read/write // (result != 0), the application will see that, otherwise // they will see 0. return true, nil - case syserror.EFBIG: + case linuxerr.Equals(linuxerr.EFBIG, translatedErr): t := kernel.TaskFromContext(ctx) if t == nil { panic("I/O error should only occur from a context associated with a Task") @@ -112,8 +112,8 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr er // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return true, syserror.EFBIG - case syserror.EINTR: + return true, linuxerr.EFBIG + case linuxerr.Equals(linuxerr.EINTR, translatedErr): // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall // needs (to indicate to the kernel what it should do). @@ -128,21 +128,21 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr er return true, errOrig } - switch translatedErr { - case syserror.EINTR: + switch { + case linuxerr.Equals(linuxerr.EINTR, translatedErr): // Syscall interrupted, but completed a partial // read/write. Like ErrWouldBlock, since we have a // partial read/write, we consume the error and return // the partial result. return true, nil - case syserror.EFAULT: + case linuxerr.Equals(linuxerr.EFAULT, translatedErr): // EFAULT is only shown the user if nothing was // read/written. If we read something (this case), they see // a partial read/write. They will then presumably try again // with an incremented buffer, which will EFAULT with // result == 0. return true, nil - case syserror.EPIPE: + case linuxerr.Equals(linuxerr.EPIPE, translatedErr): // Writes to a pipe or socket will return EPIPE if the other // side is gone. The partial write is returned. EPIPE will be // returned on the next call. @@ -150,24 +150,23 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr er // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. return true, nil - case syserror.ENOSPC: + case linuxerr.Equals(linuxerr.ENOSPC, translatedErr): // Similar to EPIPE. Return what we wrote this time, and let // ENOSPC be returned on the next call. return true, nil - case syserror.ECONNRESET, syserror.ETIMEDOUT: + case linuxerr.Equals(linuxerr.ECONNRESET, translatedErr): + fallthrough + case linuxerr.Equals(linuxerr.ETIMEDOUT, translatedErr): // For TCP sendfile connections, we may have a reset or timeout. But we // should just return n as the result. return true, nil - case syserror.EWOULDBLOCK: + case linuxerr.Equals(linuxerr.EWOULDBLOCK, translatedErr): // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking // files. Since we have a partial read/write, we consume // ErrWouldBlock, returning the partial result. return true, nil - } - - switch errOrig.(type) { - case syserror.SyscallRestartErrno: + case linuxerr.IsRestartError(translatedErr): // Identical to the EINTR case. return true, nil } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 090c5ffcb..2046a48b9 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -18,11 +18,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -120,10 +120,10 @@ var AMD64 = &kernel.SyscallTable{ 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 66: syscalls.Supported("semctl", Semctl), 67: syscalls.Supported("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 68: syscalls.Supported("msgget", Msgget), + 69: syscalls.Supported("msgsnd", Msgsnd), + 70: syscalls.Supported("msgrcv", Msgrcv), + 71: syscalls.Supported("msgctl", Msgctl), 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), @@ -174,8 +174,8 @@ var AMD64 = &kernel.SyscallTable{ 119: syscalls.Supported("setresgid", Setresgid), 120: syscalls.Supported("getresgid", Getresgid), 121: syscalls.Supported("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 122: syscalls.ErrorWithEvent("setfsuid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 124: syscalls.Supported("getsid", Getsid), 125: syscalls.Supported("capget", Capget), 126: syscalls.Supported("capset", Capset), @@ -186,12 +186,12 @@ var AMD64 = &kernel.SyscallTable{ 131: syscalls.Supported("sigaltstack", Sigaltstack), 132: syscalls.Supported("utime", Utime), 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), - 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), + 134: syscalls.Error("uselib", linuxerr.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", linuxerr.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", linuxerr.ENOSYS, "Needs filesystem support.", nil), 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 139: syscalls.ErrorWithEvent("sysfs", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/165"}), 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), @@ -200,15 +200,15 @@ var AMD64 = &kernel.SyscallTable{ 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", linuxerr.EPERM, "", nil), 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 154: syscalls.Error("modify_ldt", linuxerr.EPERM, "", nil), + 155: syscalls.Error("pivot_root", linuxerr.EPERM, "", nil), + 156: syscalls.Error("sysctl", linuxerr.EPERM, "Deprecated. Use /proc/sys instead.", nil), 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), @@ -229,15 +229,15 @@ var AMD64 = &kernel.SyscallTable{ 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 177: syscalls.Error("get_kernel_syms", linuxerr.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", linuxerr.ENOSYS, "Not supported in Linux > 2.6.", nil), 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), - 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), + 180: syscalls.Error("nfsservctl", linuxerr.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", linuxerr.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", linuxerr.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), @@ -257,18 +257,18 @@ var AMD64 = &kernel.SyscallTable{ 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 205: syscalls.Error("set_thread_area", linuxerr.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 211: syscalls.Error("get_thread_area", linuxerr.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 213: syscalls.Supported("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", linuxerr.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", linuxerr.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", linuxerr.ENOSYS, "Deprecated since Linux 3.16.", nil), 217: syscalls.Supported("getdents64", Getdents64), 218: syscalls.Supported("set_tid_address", SetTidAddress), 219: syscalls.Supported("restart_syscall", RestartSyscall), @@ -288,21 +288,21 @@ var AMD64 = &kernel.SyscallTable{ 233: syscalls.Supported("epoll_ctl", EpollCtl), 234: syscalls.Supported("tgkill", Tgkill), 235: syscalls.Supported("utimes", Utimes), - 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), + 236: syscalls.Error("vserver", linuxerr.ENOSYS, "Not implemented by Linux", nil), 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 240: syscalls.ErrorWithEvent("mq_open", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), 247: syscalls.Supported("waitid", Waitid), - 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 248: syscalls.Error("add_key", linuxerr.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", linuxerr.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", linuxerr.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), @@ -330,7 +330,7 @@ var AMD64 = &kernel.SyscallTable{ 275: syscalls.Supported("splice", Splice), 276: syscalls.Supported("tee", Tee), 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 278: syscalls.ErrorWithEvent("vmsplice", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 280: syscalls.Supported("utimensat", Utimensat), 281: syscalls.Supported("epoll_pwait", EpollPwait), @@ -350,60 +350,60 @@ var AMD64 = &kernel.SyscallTable{ 295: syscalls.Supported("preadv", Preadv), 296: syscalls.Supported("pwritev", Pwritev), 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 298: syscalls.ErrorWithEvent("perf_event_open", linuxerr.ENODEV, "No support for perf counters", nil), 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 302: syscalls.Supported("prlimit64", Prlimit64), - 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 303: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 308: syscalls.ErrorWithEvent("setns", linuxerr.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 309: syscalls.Supported("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 310: syscalls.ErrorWithEvent("process_vm_readv", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 314: syscalls.ErrorWithEvent("sched_setattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) 317: syscalls.Supported("seccomp", Seccomp), 318: syscalls.Supported("getrandom", GetRandom), 319: syscalls.Supported("memfd_create", MemfdCreate), 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 322: syscalls.Supported("execveat", Execveat), - 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 323: syscalls.ErrorWithEvent("userfaultfd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls implemented after 325 are "backports" from versions // of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 326: syscalls.ErrorWithEvent("copy_file_range", linuxerr.ENOSYS, "", nil), 327: syscalls.Supported("preadv2", Preadv2), 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 329: syscalls.ErrorWithEvent("pkey_mprotect", linuxerr.ENOSYS, "", nil), + 330: syscalls.ErrorWithEvent("pkey_alloc", linuxerr.ENOSYS, "", nil), + 331: syscalls.ErrorWithEvent("pkey_free", linuxerr.ENOSYS, "", nil), 332: syscalls.Supported("statx", Statx), - 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 333: syscalls.ErrorWithEvent("io_pgetevents", linuxerr.ENOSYS, "", nil), 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. - 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + 424: syscalls.ErrorWithEvent("pidfd_send_signal", linuxerr.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", linuxerr.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", linuxerr.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", linuxerr.ENOSYS, "", nil), 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{ @@ -413,7 +413,7 @@ var AMD64 = &kernel.SyscallTable{ }, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS }, } @@ -470,8 +470,8 @@ var ARM64 = &kernel.SyscallTable{ 38: syscalls.Supported("renameat", Renameat), 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), - 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 41: syscalls.Error("pivot_root", linuxerr.EPERM, "", nil), + 42: syscalls.Error("nfsservctl", linuxerr.ENOSYS, "Removed after Linux 3.1.", nil), 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), 45: syscalls.Supported("truncate", Truncate), @@ -504,7 +504,7 @@ var ARM64 = &kernel.SyscallTable{ 72: syscalls.Supported("pselect", Pselect), 73: syscalls.Supported("ppoll", Ppoll), 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 75: syscalls.ErrorWithEvent("vmsplice", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 76: syscalls.Supported("splice", Splice), 77: syscalls.Supported("tee", Tee), 78: syscalls.Supported("readlinkat", Readlinkat), @@ -521,7 +521,7 @@ var ARM64 = &kernel.SyscallTable{ 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), 90: syscalls.Supported("capget", Capget), 91: syscalls.Supported("capset", Capset), - 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 92: syscalls.ErrorWithEvent("personality", linuxerr.EINVAL, "Unable to change personality.", nil), 93: syscalls.Supported("exit", Exit), 94: syscalls.Supported("exit_group", ExitGroup), 95: syscalls.Supported("waitid", Waitid), @@ -556,7 +556,7 @@ var ARM64 = &kernel.SyscallTable{ 124: syscalls.Supported("sched_yield", SchedYield), 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 127: syscalls.ErrorWithEvent("sched_rr_get_interval", linuxerr.EPERM, "", nil), 128: syscalls.Supported("restart_syscall", RestartSyscall), 129: syscalls.Supported("kill", Kill), 130: syscalls.Supported("tkill", Tkill), @@ -580,8 +580,8 @@ var ARM64 = &kernel.SyscallTable{ 148: syscalls.Supported("getresuid", Getresuid), 149: syscalls.Supported("setresgid", Setresgid), 150: syscalls.Supported("getresgid", Getresgid), - 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 151: syscalls.ErrorWithEvent("setfsuid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 152: syscalls.ErrorWithEvent("setfsgid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 153: syscalls.Supported("times", Times), 154: syscalls.Supported("setpgid", Setpgid), 155: syscalls.Supported("getpgid", Getpgid), @@ -609,16 +609,16 @@ var ARM64 = &kernel.SyscallTable{ 177: syscalls.Supported("getegid", Getegid), 178: syscalls.Supported("gettid", Gettid), 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), - 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 180: syscalls.ErrorWithEvent("mq_open", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 181: syscalls.ErrorWithEvent("mq_unlink", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 182: syscalls.ErrorWithEvent("mq_timedsend", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 183: syscalls.ErrorWithEvent("mq_timedreceive", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 184: syscalls.ErrorWithEvent("mq_notify", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 185: syscalls.ErrorWithEvent("mq_getsetattr", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 186: syscalls.Supported("msgget", Msgget), + 187: syscalls.Supported("msgctl", Msgctl), + 188: syscalls.Supported("msgrcv", Msgrcv), + 189: syscalls.Supported("msgsnd", Msgsnd), 190: syscalls.Supported("semget", Semget), 191: syscalls.Supported("semctl", Semctl), 192: syscalls.Supported("semtimedop", Semtimedop), @@ -646,9 +646,9 @@ var ARM64 = &kernel.SyscallTable{ 214: syscalls.Supported("brk", Brk), 215: syscalls.Supported("munmap", Munmap), 216: syscalls.Supported("mremap", Mremap), - 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 217: syscalls.Error("add_key", linuxerr.EACCES, "Not available to user.", nil), + 218: syscalls.Error("request_key", linuxerr.EACCES, "Not available to user.", nil), + 219: syscalls.Error("keyctl", linuxerr.EACCES, "Not available to user.", nil), 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), 221: syscalls.Supported("execve", Execve), 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), @@ -663,72 +663,72 @@ var ARM64 = &kernel.SyscallTable{ 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), - 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 234: syscalls.ErrorWithEvent("remap_file_pages", linuxerr.ENOSYS, "Deprecated since Linux 3.16.", nil), 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 241: syscalls.ErrorWithEvent("perf_event_open", linuxerr.ENODEV, "No support for perf counters", nil), 242: syscalls.Supported("accept4", Accept4), 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), 260: syscalls.Supported("wait4", Wait4), 261: syscalls.Supported("prlimit64", Prlimit64), - 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 262: syscalls.ErrorWithEvent("fanotify_init", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 263: syscalls.ErrorWithEvent("fanotify_mark", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 264: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 265: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), - 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 268: syscalls.ErrorWithEvent("setns", linuxerr.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 270: syscalls.ErrorWithEvent("process_vm_readv", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 271: syscalls.ErrorWithEvent("process_vm_writev", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/158"}), 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 274: syscalls.ErrorWithEvent("sched_setattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 275: syscalls.ErrorWithEvent("sched_getattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 276: syscalls.ErrorWithEvent("renameat2", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) 277: syscalls.Supported("seccomp", Seccomp), 278: syscalls.Supported("getrandom", GetRandom), 279: syscalls.Supported("memfd_create", MemfdCreate), 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 281: syscalls.Supported("execveat", Execveat), - 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 282: syscalls.ErrorWithEvent("userfaultfd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 284 are "backports" from versions of Linux after 4.4. - 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 285: syscalls.ErrorWithEvent("copy_file_range", linuxerr.ENOSYS, "", nil), 286: syscalls.Supported("preadv2", Preadv2), 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 288: syscalls.ErrorWithEvent("pkey_mprotect", linuxerr.ENOSYS, "", nil), + 289: syscalls.ErrorWithEvent("pkey_alloc", linuxerr.ENOSYS, "", nil), + 290: syscalls.ErrorWithEvent("pkey_free", linuxerr.ENOSYS, "", nil), 291: syscalls.Supported("statx", Statx), - 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 292: syscalls.ErrorWithEvent("io_pgetevents", linuxerr.ENOSYS, "", nil), 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. - 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + 424: syscalls.ErrorWithEvent("pidfd_send_signal", linuxerr.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", linuxerr.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", linuxerr.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", linuxerr.ENOSYS, "", nil), 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{}, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS }, } diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index e8c2d8f9e..373948991 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and @@ -29,7 +29,7 @@ import ( // syscalls are moved into this package, then they can be unexported. func CopyInSigSet(t *kernel.Task, sigSetAddr hostarch.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } b := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { @@ -66,6 +66,6 @@ func copyInSigSetWithSize(t *kernel.Task, addr hostarch.Addr) (hostarch.Addr, ui maskSize := uint(hostarch.ByteOrder.Uint64(in[8:])) return maskAddr, maskSize, nil default: - return 0, 0, syserror.ENOSYS + return 0, 0, linuxerr.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 70e8569a8..2f00c3783 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -17,6 +17,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -25,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" ) @@ -42,7 +43,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, err } if idIn != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents)) @@ -66,7 +67,7 @@ func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ctx := t.MemoryManager().DestroyAIOContext(t, id) if ctx == nil { // Does not exist. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Drain completed requests amd wait for pending requests until there are no @@ -97,12 +98,12 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Sanity check arguments. if minEvents < 0 || minEvents > events { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Setup the timeout. @@ -114,7 +115,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } if !d.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration()) haveDeadline = true @@ -134,10 +135,10 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S var err error v, err = waitForRequest(ctx, t, haveDeadline, deadline) if err != nil { - if count > 0 || err == syserror.ETIMEDOUT { + if count > 0 || linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return uintptr(count), nil, nil } - return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + return 0, nil, syserr.ConvertIntr(err, linuxerr.EINTR) } } @@ -171,7 +172,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl done := ctx.WaitChannel() if done == nil { // Context has been destroyed. - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil { return nil, err @@ -184,7 +185,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } // Since this I/O will be asynchronous with respect to t's task goroutine, @@ -206,7 +207,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) default: // Not a supported command. - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } } @@ -215,7 +216,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) // It is not presently supported (ENOSYS indicates no support on this // architecture). func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // LINT.IfChange @@ -269,7 +270,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host file := t.GetFile(cb.FD) if file == nil { // File not found. - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) @@ -279,14 +280,14 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. - return syserror.EBADF + return linuxerr.EBADF } defer eventFile.DecRef(t) // Check that it is an eventfd. if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { // Not an event FD. - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -299,14 +300,14 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } } // Prepare the request. ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } if err := ctx.Prepare(); err != nil { return err @@ -335,7 +336,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc addr := args[2].Pointer() if nrEvents < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } for i := int32(0); i < nrEvents; i++ { @@ -354,7 +355,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } cbAddr = hostarch.Addr(cbAddrP) default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // Copy in this callback. diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index d3b85e11b..1e714503c 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -16,22 +16,22 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) { if tid < 0 { - err = syserror.EINVAL + err = linuxerr.EINVAL return } if tid > 0 { t = t.PIDNamespace().TaskWithID(tid) } if t == nil { - err = syserror.ESRCH + err = linuxerr.ESRCH return } creds := t.Credentials() @@ -97,7 +97,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } if dataAddr != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil } @@ -115,7 +115,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } var data linux.CapUserData if _, err := data.CopyIn(t, dataAddr); err != nil { @@ -128,7 +128,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } var data [2]linux.CapUserData if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { @@ -144,6 +144,6 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, nil, err } - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go index dd43cf18d..2b2dbd9f9 100644 --- a/pkg/sentry/syscalls/linux/sys_clone_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go index cf68a8949..877c86e6a 100644 --- a/pkg/sentry/syscalls/linux/sys_clone_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 69cbc98d0..6c807124c 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -16,12 +16,13 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) @@ -31,7 +32,7 @@ import ( func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags & ^linux.EPOLL_CLOEXEC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } closeOnExec := flags&linux.EPOLL_CLOEXEC != 0 @@ -48,7 +49,7 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S size := args[0].Int() if size <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } fd, err := syscalls.CreateEpoll(t, false) @@ -101,14 +102,14 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc mask |= waiter.EventHUp | waiter.EventErr return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data) default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } func waitEpoll(t *kernel.Task, fd int32, eventsAddr hostarch.Addr, max int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { r, err := syscalls.WaitEpoll(t, fd, max, timeoutInNanos) if err != nil { - return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + return 0, nil, syserr.ConvertIntr(err, linuxerr.EINTR) } if len(r) != 0 { diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 3b4f879e4..7ba9a755e 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" - "gvisor.dev/gvisor/pkg/syserror" ) // Eventfd2 implements linux syscall eventfd2(2). @@ -30,7 +30,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) if flags & ^allOps != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } event := eventfd.New(t, uint64(initVal), flags&linux.EFD_SEMAPHORE != 0) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 90a719ba2..e79b92fb6 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -18,6 +18,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -29,7 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // fileOpAt performs an operation on the second last component in the path. @@ -79,12 +80,12 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro // Need to extract the given FD. f = t.GetFile(dirFD) if f == nil { - return syserror.EBADF + return linuxerr.EBADF } rel = f.Dirent if !fs.IsDir(rel.Inode.StableAttr) { f.DecRef(t) - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } @@ -121,7 +122,7 @@ func copyInPath(t *kernel.Task, addr hostarch.Addr, allowEmpty bool) (path strin return "", false, err } if path == "" && !allowEmpty { - return "", false, syserror.ENOENT + return "", false, linuxerr.ENOENT } // If the path ends with a /, then checks must be enforced in various @@ -152,7 +153,7 @@ func openAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint) (fd uin } if fs.IsSymlink(d.Inode.StableAttr) && !resolve { - return syserror.ELOOP + return linuxerr.ELOOP } fileFlags := linuxToFlags(flags) @@ -161,22 +162,22 @@ func openAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint) (fd uin if fs.IsDir(d.Inode.StableAttr) { // Don't allow directories to be opened writable. if fileFlags.Write { - return syserror.EISDIR + return linuxerr.EISDIR } } else { // If O_DIRECTORY is set, but the file is not a directory, then fail. if fileFlags.Directory { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // If it's a directory, then make sure. if dirPath { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } } file, err := d.Inode.GetFile(t, d, fileFlags) if err != nil { - return syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } defer file.DecRef(t) @@ -214,12 +215,12 @@ func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod return err } if dirPath { - return syserror.ENOENT + return linuxerr.ENOENT } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Do we have the appropriate permissions on the parent? @@ -260,7 +261,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod // Instead of emulating this seemingly useless behaviour, we'll // indicate that the filesystem doesn't support the creation of // sockets. - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP case linux.ModeCharacterDevice: fallthrough @@ -270,12 +271,12 @@ func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod // // When we start supporting block and character devices, we'll // need to check for CAP_MKNOD here. - return syserror.EPERM + return linuxerr.EPERM default: // "EINVAL - mode requested creation of something other than a // regular file, device special file, FIFO or socket." - mknod(2) - return syserror.EINVAL + return linuxerr.EINVAL } }) } @@ -307,7 +308,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode return 0, err } if dirPath { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } fileFlags := linuxToFlags(flags) @@ -325,7 +326,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode ) for { if !fs.IsDir(parent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Start by looking up the dirent at 'name'. @@ -339,7 +340,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode // O_EXCL flag was passed, then we can immediately // return EEXIST. if flags&linux.O_EXCL != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } // If we have a non-symlink, then we can proceed. @@ -350,7 +351,7 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode // If O_NOFOLLOW was passed, then don't try to resolve // anything. if flags&linux.O_NOFOLLOW != 0 { - return syserror.ELOOP + return linuxerr.ELOOP } // Try to resolve the symlink directly to a Dirent. @@ -394,8 +395,8 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode } var newFile *fs.File - switch err { - case nil: + switch { + case err == nil: // Like sys_open, check for a few things about the // filesystem before trying to get a reference to the // fs.File. The same constraints on Check apply. @@ -415,10 +416,10 @@ func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode // Create a new fs.File. newFile, err = found.Inode.GetFile(t, found, fileFlags) if err != nil { - return syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } defer newFile.DecRef(t) - case syserror.ENOENT: + case linuxerr.Equals(linuxerr.ENOENT, err): // File does not exist. Proceed with creation. // Do we have write permissions on the parent? @@ -527,7 +528,7 @@ func accessAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode uint) error // Sanity check the mode. if mode&^(rOK|wOK|xOK) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { @@ -595,7 +596,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -684,7 +685,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Note this is >= because we need a terminator. if uint(len(s)) >= size { - return 0, nil, syserror.ERANGE + return 0, nil, linuxerr.ERANGE } // Copy out the path name for the node. @@ -703,7 +704,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() if !t.HasCapability(linux.CAP_SYS_CHROOT) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } path, _, err := copyInPath(t, addr, false /* allowEmpty */) @@ -714,7 +715,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // Is it a directory? if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Does it have execute permissions? @@ -739,7 +740,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // Is it a directory? if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Does it have execute permissions? @@ -758,13 +759,13 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Is it a directory? if !fs.IsDir(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ENOTDIR + return 0, nil, linuxerr.ENOTDIR } // Does it have execute permissions? @@ -789,12 +790,12 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // (and other reference-holding operations complete). file, _ := t.FDTable().Remove(t, fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) err := file.Flush(t) - return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) + return 0, nil, handleIOError(t, false /* partial */, err, linuxerr.EINTR, "close", file) } // Dup implements linux syscall dup(2). @@ -803,13 +804,13 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { - return 0, nil, syserror.EMFILE + return 0, nil, linuxerr.EMFILE } return uintptr(newFD), nil, nil } @@ -824,7 +825,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldfd == newfd { oldFile := t.GetFile(oldfd) if oldFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer oldFile.DecRef(t) @@ -843,12 +844,12 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC flags := args[2].Uint() if oldfd == newfd { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } oldFile := t.GetFile(oldfd) if oldFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer oldFile.DecRef(t) @@ -905,7 +906,7 @@ func fSetOwn(t *kernel.Task, fd int, file *fs.File, who int32) error { if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { - return syserror.EINVAL + return linuxerr.EINVAL } pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)) a.SetOwnerProcessGroup(t, pg) @@ -923,7 +924,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file, flags := t.FDTable().Get(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -956,7 +957,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Normally pipe and socket types lack lock operations. We diverge and use a heavy // hammer by only allowing locks on files and directories. if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Copy in the lock request. @@ -976,7 +977,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case 2: sw = fs.SeekEnd default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Compute the lock offset. @@ -995,7 +996,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } off = uattr.Size default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Compute the lock range. @@ -1009,33 +1010,33 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch flock.Type { case linux.F_RDLCK: if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, nil) { - return 0, nil, syserror.EAGAIN + return 0, nil, linuxerr.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } return 0, nil, nil case linux.F_WRLCK: if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, nil) { - return 0, nil, syserror.EAGAIN + return 0, nil, linuxerr.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } return 0, nil, nil @@ -1043,7 +1044,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng) return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } case linux.F_GETOWN: return uintptr(fGetOwn(t, file)), nil, nil @@ -1066,47 +1067,47 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID)) if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } a.SetOwnerTask(t, task) return 0, nil, nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) if tg == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } a.SetOwnerThreadGroup(t, tg) return 0, nil, nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) if pg == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } a.SetOwnerProcessGroup(t, pg) return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } case linux.F_GET_SEALS: val, err := tmpfs.GetSeals(file.Dirent.Inode) return uintptr(val), nil, err case linux.F_ADD_SEALS: if !file.Flags().Write { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint()) return 0, nil, err case linux.F_GETPIPE_SZ: sz, ok := file.FileOperations.(fs.FifoSizer) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } size, err := sz.FifoSize(t, file) return uintptr(size), nil, err case linux.F_SETPIPE_SZ: sz, ok := file.FileOperations.(fs.FifoSizer) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := sz.SetFifoSize(int64(args[2].Int())) return uintptr(n), nil, err @@ -1118,7 +1119,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, a.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -1131,18 +1132,18 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Note: offset is allowed to be negative. if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if fs.IsPipe(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } switch advice { @@ -1153,7 +1154,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.POSIX_FADV_DONTNEED: case linux.POSIX_FADV_NOREUSE: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Sure, whatever. @@ -1172,18 +1173,18 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMod return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Does this directory exist already? remainingTraversals := uint(linux.MaxSymlinkTraversals) f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) - switch err { - case nil: + switch { + case err == nil: // The directory existed. defer f.DecRef(t) - return syserror.EEXIST - case syserror.EACCES: + return linuxerr.EEXIST + case linuxerr.Equals(linuxerr.EACCES, err): // Permission denied while walking to the directory. return err default: @@ -1224,21 +1225,21 @@ func rmdirAt(t *kernel.Task, dirFD int32, addr hostarch.Addr) error { // Special case: removing the root always returns EBUSY. if path == "/" { - return syserror.EBUSY + return linuxerr.EBUSY } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Linux returns different ernos when the path ends in single // dot vs. double dots. switch name { case ".": - return syserror.EINVAL + return linuxerr.EINVAL case "..": - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } if err := d.MayDelete(t, root, name); err != nil { @@ -1262,7 +1263,7 @@ func symlinkAt(t *kernel.Task, dirFD int32, newAddr hostarch.Addr, oldAddr hosta return err } if dirPath { - return syserror.ENOENT + return linuxerr.ENOENT } // The oldPath is copied in verbatim. This is because the symlink @@ -1272,12 +1273,12 @@ func symlinkAt(t *kernel.Task, dirFD int32, newAddr hostarch.Addr, oldAddr hosta return err } if oldPath == "" { - return syserror.ENOENT + return linuxerr.ENOENT } return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Make sure we have write permissions on the parent directory. @@ -1329,10 +1330,10 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error { // If we are not the owner, then the file must be regular and have // Read+Write permissions. if !fs.IsRegular(target.StableAttr) { - return syserror.EPERM + return linuxerr.EPERM } if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil { - return syserror.EPERM + return linuxerr.EPERM } return nil @@ -1351,13 +1352,13 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int3 return err } if dirPath { - return syserror.ENOENT + return linuxerr.ENOENT } if allowEmpty && oldPath == "" { target := t.GetFile(oldDirFD) if target == nil { - return syserror.EBADF + return linuxerr.EBADF } defer target.DecRef(t) if err := mayLinkAt(t, target.Dirent.Inode); err != nil { @@ -1367,7 +1368,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int3 // Resolve the target directory. return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { if !fs.IsDir(newParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Make sure we have write permissions on the parent directory. @@ -1388,7 +1389,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int3 // Next resolve newDirFD and newAddr to the parent dirent and name. return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { if !fs.IsDir(newParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Make sure we have write permissions on the parent directory. @@ -1431,14 +1432,14 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Sanity check flags. if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) @@ -1454,7 +1455,7 @@ func readlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, bufAddr hostarc return 0, err } if dirPath { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { @@ -1464,8 +1465,8 @@ func readlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, bufAddr hostarc } s, err := d.Inode.Readlink(t) - if err == syserror.ENOLINK { - return syserror.EINVAL + if linuxerr.Equals(linuxerr.ENOLINK, err) { + return linuxerr.EINVAL } if err != nil { return err @@ -1519,7 +1520,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr) error { return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error { if !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } if err := d.MayDelete(t, root, name); err != nil { @@ -1557,7 +1558,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc length := args[1].Int64() if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) @@ -1565,7 +1566,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if dirPath { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { @@ -1573,17 +1574,17 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { if fs.IsDir(d.Inode.StableAttr) { - return syserror.EISDIR + return linuxerr.EISDIR } // In contrast to open(O_TRUNC), truncate(2) is only valid for file // types. if !fs.IsFile(d.Inode.StableAttr) { - return syserror.EINVAL + return linuxerr.EINVAL } // Reject truncation if the access permissions do not allow truncation. @@ -1610,25 +1611,25 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Reject truncation if the file flags do not permit this operation. // This is different from truncate(2) above. if !file.Flags().Write { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // In contrast to open(O_TRUNC), truncate(2) is only valid for file // types. Note that this is different from truncate(2) above, where a // directory returns EISDIR. if !fs.IsFile(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { @@ -1636,7 +1637,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil { @@ -1682,7 +1683,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { kuid := c.UserNamespace.MapToKUID(uid) // Valid UID must be supplied if UID is to be changed. if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } // "Only a privileged process (CAP_CHOWN) may change the owner @@ -1692,7 +1693,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { // explicitly not changing its UID. isNoop := uattr.Owner.UID == kuid if !(hasCap || (isOwner && isNoop)) { - return syserror.EPERM + return linuxerr.EPERM } // The setuid and setgid bits are cleared during a chown. @@ -1706,7 +1707,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { kgid := c.UserNamespace.MapToKGID(gid) // Valid GID must be supplied if GID is to be changed. if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } // "The owner of a file may change the group of the file to any @@ -1715,7 +1716,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { isNoop := uattr.Owner.GID == kgid isMemberGroup := c.InGroup(kgid) if !(hasCap || (isOwner && (isNoop || isMemberGroup))) { - return syserror.EPERM + return linuxerr.EPERM } // The setuid and setgid bits are cleared during a chown. @@ -1737,7 +1738,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { if clearPrivilege && uattr.Perms.HasSetUIDOrGID() && !fs.IsDir(d.Inode.StableAttr) { uattr.Perms.DropSetUIDAndMaybeGID() if !d.Inode.SetPermissions(t, d, uattr.Perms) { - return syserror.EPERM + return linuxerr.EPERM } } @@ -1754,7 +1755,7 @@ func chownAt(t *kernel.Task, fd int32, addr hostarch.Addr, resolve, allowEmpty b // Annoying. What's wrong with fchown? file := t.GetFile(fd) if file == nil { - return syserror.EBADF + return linuxerr.EBADF } defer file.DecRef(t) @@ -1792,7 +1793,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -1808,7 +1809,7 @@ func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := args[4].Int() if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid) @@ -1817,12 +1818,12 @@ func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { // Must own file to change mode. if !d.Inode.CheckOwnership(t) { - return syserror.EPERM + return linuxerr.EPERM } p := fs.FilePermsFromMode(mode) if !d.Inode.SetPermissions(t, d, p) { - return syserror.EPERM + return linuxerr.EPERM } // File attribute changed, generate notification. @@ -1857,7 +1858,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -1888,7 +1889,7 @@ func utimes(t *kernel.Task, dirFD int32, addr hostarch.Addr, ts fs.TimeSpec, res if !d.Inode.CheckOwnership(t) { // Trying to set a specific time? Must be owner. if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) { - return syserror.EPERM + return linuxerr.EPERM } // Trying to set to current system time? Must have write access. @@ -1913,11 +1914,11 @@ func utimes(t *kernel.Task, dirFD int32, addr hostarch.Addr, ts fs.TimeSpec, res if addr == 0 && dirFD != linux.AT_FDCWD { if !resolve { // Linux returns EINVAL in this case. See utimes.c. - return syserror.EINVAL + return linuxerr.EINVAL } f := t.GetFile(dirFD) if f == nil { - return syserror.EBADF + return linuxerr.EBADF } defer f.DecRef(t) @@ -1996,7 +1997,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // If both are UTIME_OMIT, this is a noop. @@ -2031,7 +2032,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if times[0].Usec >= 1e6 || times[0].Usec < 0 || times[1].Usec >= 1e6 || times[1].Usec < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ts = fs.TimeSpec{ @@ -2058,26 +2059,26 @@ func renameAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD in return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string, _ uint) error { if !fs.IsDir(oldParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Rename rejects paths that end in ".", "..", or empty (i.e. // the root) with EBUSY. switch oldName { case "", ".", "..": - return syserror.EBUSY + return linuxerr.EBUSY } return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error { if !fs.IsDir(newParent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Rename rejects paths that end in ".", "..", or empty // (i.e. the root) with EBUSY. switch newName { case "", ".", "..": - return syserror.EBUSY + return linuxerr.EBUSY } return fs.Rename(t, root, oldParent, oldName, newParent, newName) @@ -2112,39 +2113,39 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if offset < 0 || length <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if mode != 0 { t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOTSUP + return 0, nil, linuxerr.ENOTSUP } if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if fs.IsPipe(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if fs.IsDir(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EISDIR + return 0, nil, linuxerr.EISDIR } if !fs.IsRegular(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.ENODEV + return 0, nil, linuxerr.ENODEV } size := offset + length if size < 0 { - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil { @@ -2165,7 +2166,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -2183,31 +2184,31 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, nil) { - return 0, nil, syserror.EWOULDBLOCK + return 0, nil, linuxerr.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } case linux.LOCK_SH: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, nil) { - return 0, nil, syserror.EWOULDBLOCK + return 0, nil, linuxerr.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, t) { - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } } case linux.LOCK_UN: file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng) default: // flock(2): EINVAL operation is invalid. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil @@ -2226,7 +2227,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if flags&^memfdAllFlags != 0 { // Unknown bits in flags. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 @@ -2237,7 +2238,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } if len(name) > memfdMaxNameLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } name = memfdPrefix + name diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index eeea1613b..bcdd7b633 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -18,11 +18,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // futexWaitRestartBlock encapsulates the state required to restart futex(2) @@ -74,7 +75,7 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo } t.Futex().WaitComplete(w, t) - return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is @@ -102,7 +103,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add // The wait was unsuccessful for some reason other than interruption. Simply // forward the error. - if err != syserror.ErrInterrupted { + if err != linuxerr.ErrInterrupted { return 0, err } @@ -110,7 +111,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add // The wait duration was absolute, restart with the original arguments. if forever { - return 0, syserror.ERESTARTSYS + return 0, linuxerr.ERESTARTSYS } // The wait duration was relative, restart with the remaining duration. @@ -121,7 +122,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add val: val, mask: mask, }) - return 0, syserror.ERESTART_RESTARTBLOCK + return 0, linuxerr.ERESTART_RESTARTBLOCK } func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error { @@ -149,7 +150,7 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch. } t.Futex().WaitComplete(w, t) - return syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { @@ -159,7 +160,7 @@ func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { return err } if !locked { - return syserror.EWOULDBLOCK + return linuxerr.EWOULDBLOCK } return nil } @@ -210,7 +211,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // WAIT_BITSET uses an absolute timeout which is either // CLOCK_MONOTONIC or CLOCK_REALTIME. if mask == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) return n, nil, err @@ -224,7 +225,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FUTEX_WAKE_BITSET: if mask == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if val <= 0 { // The Linux kernel wakes one waiter even if val is @@ -279,11 +280,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS default: // We don't even know about this command. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } } @@ -295,7 +296,7 @@ func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel length := args[1].SizeT() if length != uint(linux.SizeOfRobustListHead) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.SetRobustList(head) return 0, nil, nil @@ -310,13 +311,13 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel sizeAddr := args[2].Pointer() if tid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ot := t if tid != 0 { if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index bbba71d8f..9f7a5ae8a 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -19,11 +19,11 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -38,7 +38,7 @@ func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc minSize := int(smallestDirent(t.Arch())) if size < minSize { // size is smaller than smallest possible dirent. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := getdents(t, fd, addr, size, (*dirent).Serialize) @@ -54,7 +54,7 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy minSize := int(smallestDirent64(t.Arch())) if size < minSize { // size is smaller than smallest possible dirent. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } n, err := getdents(t, fd, addr, size, (*dirent).Serialize64) @@ -66,7 +66,7 @@ func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy func getdents(t *kernel.Task, fd int32, addr hostarch.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { dir := t.GetFile(fd) if dir == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer dir.DecRef(t) @@ -82,7 +82,7 @@ func getdents(t *kernel.Task, fd int32, addr hostarch.Addr, size int, f func(*di ds := newDirentSerializer(f, w, t.Arch(), size) rerr := dir.Readdir(t, ds) - switch err := handleIOError(t, ds.Written() > 0, rerr, syserror.ERESTARTSYS, "getdents", dir); err { + switch err := handleIOError(t, ds.Written() > 0, rerr, linuxerr.ERESTARTSYS, "getdents", dir); err { case nil: dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0) return uintptr(ds.Written()), nil diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index a29d307e5..50fcadb58 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -15,10 +15,10 @@ package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -142,7 +142,7 @@ func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := int(args[0].Int()) if size < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } kgids := t.Credentials().ExtraKGIDs // "If size is zero, list is not modified, but the total number of @@ -151,7 +151,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return uintptr(len(kgids)), nil, nil } if size < len(kgids) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } gids := make([]auth.GID, len(kgids)) for i, kgid := range kgids { @@ -167,7 +167,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := args[0].Int() if size < 0 || size > maxNGroups { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if size == 0 { return 0, nil, t.SetExtraGIDs(nil) diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index cf47bb9dd..b7ad1922e 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) @@ -30,7 +30,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. flags := int(args[0].Int()) if flags&^allFlags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } dirent := fs.NewDirent(t, anon.NewInode(t), "inotify") @@ -65,14 +65,14 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { file := t.GetFile(fd) if file == nil { // Invalid fd. - return nil, nil, syserror.EBADF + return nil, nil, linuxerr.EBADF } ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. file.DecRef(t) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } return ino, file, nil @@ -91,7 +91,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ino, file, err := fdToInotify(t, fd) @@ -108,7 +108,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent, _ uint) error { // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Copy out to the return frame. diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 0046347cb..4a5712a29 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -15,10 +15,10 @@ package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -31,7 +31,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -44,11 +44,11 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case 2: sw = fs.SeekEnd default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } offset, serr := file.Seek(t, sw, offset) - err := handleIOError(t, false /* partialResult */, serr, syserror.ERESTARTSYS, "lseek", file) + err := handleIOError(t, false /* partialResult */, serr, linuxerr.ERESTARTSYS, "lseek", file) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_membarrier.go b/pkg/sentry/syscalls/linux/sys_membarrier.go index 63ee5d435..6ceedc086 100644 --- a/pkg/sentry/syscalls/linux/sys_membarrier.go +++ b/pkg/sentry/syscalls/linux/sys_membarrier.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Membarrier implements syscall membarrier(2). @@ -29,7 +29,7 @@ func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy switch cmd { case linux.MEMBARRIER_CMD_QUERY: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var supportedCommands uintptr if t.Kernel().Platform.HaveGlobalMemoryBarrier() { @@ -46,58 +46,58 @@ func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return supportedCommands, nil, nil case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier() case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // no-op return 0, nil, nil case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierPrivate() return 0, nil, nil case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.MemoryManager().IsMembarrierRSeqEnabled() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } // MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have // the ability to preempt specific CPUs. return 0, nil, t.Kernel().Platform.PreemptAllCPUs() case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: if flags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierRSeq() return 0, nil, nil default: // Probably a command we don't implement. t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go index 6d27f4292..6e7bcb868 100644 --- a/pkg/sentry/syscalls/linux/sys_mempolicy.go +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -18,10 +18,10 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -43,7 +43,7 @@ func copyInNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32) (uint64, // maxnode-1, not maxnode, as the number of bits. bits := maxnode - 1 if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0 - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if bits == 0 { return 0, nil @@ -58,12 +58,12 @@ func copyInNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32) (uint64, // Check that only allowed bits in the first unsigned long in the nodemask // are set. if val&^allowedNodemask != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Check that all remaining bits in the nodemask are 0. for i := 8; i < len(buf); i++ { if buf[i] != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } return val, nil @@ -74,7 +74,7 @@ func copyOutNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32, val uin // bits. bits := maxnode - 1 if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0 - return syserror.EINVAL + return linuxerr.EINVAL } if bits == 0 { return nil @@ -89,7 +89,7 @@ func copyOutNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32, val uin if bits > 64 { remAddr, ok := addr.AddLength(8) if !ok { - return syserror.EFAULT + return linuxerr.EFAULT } remUint64 := (bits - 1) / 64 if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{ @@ -110,7 +110,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. flags := args[4].Uint() if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } nodeFlag := flags&linux.MPOL_F_NODE != 0 addrFlag := flags&linux.MPOL_F_ADDR != 0 @@ -119,7 +119,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // "EINVAL: The value specified by maxnode is less than the number of node // IDs supported by the system." - get_mempolicy(2) if nodemask != 0 && maxnode < maxNodes { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is @@ -130,7 +130,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either // MPOL_F_ADDR or MPOL_F_NODE." if nodeFlag || addrFlag { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil { return 0, nil, err @@ -184,7 +184,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will // just (usually) fail to find a VMA at address 0 and return EFAULT. if addr != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If flags is specified as 0, then information about the calling thread's @@ -198,7 +198,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. policy, nodemaskVal := t.NumaPolicy() if nodeFlag { if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } policy = linux.MPOL_DEFAULT // maxNodes == 1 } @@ -240,12 +240,12 @@ func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall flags := args[5].Uint() if flags&^linux.MPOL_MF_VALID != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be // privileged (CAP_SYS_NICE) to use this flag." - mbind(2) if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode) @@ -264,11 +264,11 @@ func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nod mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS) if flags == linux.MPOL_MODE_FLAGS { // Can't specify both mode flags simultaneously. - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } if mode < 0 || mode >= linux.MPOL_MAX { // Must specify a valid mode. - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } var nodemaskVal uint64 @@ -285,22 +285,22 @@ func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nod // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate; // Linux allows a nodemask to be specified, as long as it is empty. if nodemaskVal != 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } case linux.MPOL_BIND, linux.MPOL_INTERLEAVE: // These require a non-empty nodemask. if nodemaskVal == 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } case linux.MPOL_PREFERRED: // This permits an empty nodemask, as long as no flags are set. if nodemaskVal == 0 && flags != 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } case linux.MPOL_LOCAL: // This requires an empty nodemask and no flags set ... if nodemaskVal != 0 || flags != 0 { - return 0, 0, syserror.EINVAL + return 0, 0, linuxerr.EINVAL } // ... and is implemented as MPOL_PREFERRED. mode = linux.MPOL_PREFERRED diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 70da0707d..7efd17d40 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -18,13 +18,13 @@ import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/syserr" ) // Brk implements linux syscall brk(2). @@ -51,7 +51,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Require exactly one of MAP_PRIVATE and MAP_SHARED. if private == shared { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } opts := memmap.MMapOpts{ @@ -84,14 +84,14 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Convert the passed FD to a file reference. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) flags := file.Flags() // mmap unconditionally requires that the FD is readable. if !flags.Read { - return 0, nil, syserror.EACCES + return 0, nil, linuxerr.EACCES } // MAP_SHARED requires that the FD be writable for PROT_WRITE. if shared && !flags.Write { @@ -132,7 +132,7 @@ func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal newAddr := args[4].Pointer() if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } mayMove := flags&linux.MREMAP_MAYMOVE != 0 fixed := flags&linux.MREMAP_FIXED != 0 @@ -147,7 +147,7 @@ func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case !mayMove && fixed: // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be // specified." - mremap(2) - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ @@ -178,7 +178,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // "The Linux implementation requires that the address addr be // page-aligned, and allows length to be zero." - madvise(2) if addr.RoundDown() != addr { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if length == 0 { return 0, nil, nil @@ -186,7 +186,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Not explicitly stated: length need not be page-aligned. lenAddr, ok := hostarch.Addr(length).RoundUp() if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } length = uint64(lenAddr) @@ -211,13 +211,13 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca case linux.MADV_REMOVE: // These "suggestions" have application-visible side effects, so we // have to indicate that we don't support them. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS case linux.MADV_HWPOISON: // Only privileged processes are allowed to poison pages. - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM default: // If adv is not a valid value tell the caller. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -228,25 +228,25 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca vec := args[2].Pointer() if addr != addr.RoundDown() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "The length argument need not be a multiple of the page size, but since // residency information is returned for whole pages, length is effectively // rounded up to the next multiple of the page size." - mincore(2) la, ok := hostarch.Addr(length).RoundUp() if !ok { - return 0, nil, syserror.ENOMEM + return 0, nil, linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { - return 0, nil, syserror.ENOMEM + return 0, nil, linuxerr.ENOMEM } // Pretend that all mapped pages are "resident in core". mapped := t.MemoryManager().VirtualMemorySizeRange(ar) // "ENOMEM: addr to addr + length contained unmapped memory." if mapped != uint64(la) { - return 0, nil, syserror.ENOMEM + return 0, nil, linuxerr.ENOMEM } resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize)) _, err := t.CopyOutBytes(vec, resident) @@ -265,11 +265,11 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // semantics that are (currently) equivalent to specifying MS_ASYNC." - // msync(2) if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } sync := flags&linux.MS_SYNC != 0 if sync && flags&linux.MS_ASYNC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ Sync: sync, @@ -277,7 +277,7 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall }) // MSync calls fsync, the same interrupt conversion rules apply, see // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // Mlock implements linux syscall mlock(2). @@ -295,7 +295,7 @@ func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal flags := args[2].Int() if flags&^(linux.MLOCK_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager @@ -318,7 +318,7 @@ func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := args[0].Int() if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 864d2138c..6d26f89b9 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -16,12 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Mount implements Linux syscall mount(2). @@ -67,7 +66,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Must have CAP_SYS_ADMIN in the mount namespace's associated user // namespace. if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | @@ -83,15 +82,15 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // unknown or unsupported flags are passed. Since we don't implement // everything, we fail explicitly on flags that are unimplemented. if flags&(unsupportedOps|unsupportedFlags) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } rsys, ok := fs.FindFilesystem(fsType) if !ok { - return 0, nil, syserror.ENODEV + return 0, nil, linuxerr.ENODEV } if !rsys.AllowUserMount() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } var superFlags fs.MountSourceFlags @@ -107,7 +106,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { @@ -130,7 +129,7 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE if flags&unsupported != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, _, err := copyInPath(t, addr, false /* allowEmpty */) @@ -143,7 +142,7 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // // Currently, this is always the init task's user namespace. if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } resolve := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW diff --git a/pkg/sentry/syscalls/linux/sys_msgqueue.go b/pkg/sentry/syscalls/linux/sys_msgqueue.go new file mode 100644 index 000000000..60b989ee7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_msgqueue.go @@ -0,0 +1,193 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" + "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" +) + +// Msgget implements msgget(2). +func Msgget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := ipc.Key(args[0].Int()) + flag := args[1].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + r := t.IPCNamespace().MsgqueueRegistry() + queue, err := r.FindOrCreate(t, key, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + return uintptr(queue.ID()), nil, nil +} + +// Msgsnd implements msgsnd(2). +func Msgsnd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := ipc.ID(args[0].Int()) + msgAddr := args[1].Pointer() + size := args[2].Int64() + flag := args[3].Int() + + if size < 0 || size > linux.MSGMAX { + return 0, nil, linuxerr.EINVAL + } + + wait := flag&linux.IPC_NOWAIT != linux.IPC_NOWAIT + pid := int32(t.ThreadGroup().ID()) + + buf := linux.MsgBuf{ + Text: make([]byte, size), + } + if _, err := buf.CopyIn(t, msgAddr); err != nil { + return 0, nil, err + } + + queue, err := t.IPCNamespace().MsgqueueRegistry().FindByID(id) + if err != nil { + return 0, nil, err + } + + msg := msgqueue.Message{ + Type: int64(buf.Type), + Text: buf.Text, + Size: uint64(size), + } + return 0, nil, queue.Send(t, msg, t, wait, pid) +} + +// Msgrcv implements msgrcv(2). +func Msgrcv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := ipc.ID(args[0].Int()) + msgAddr := args[1].Pointer() + size := args[2].Int64() + mType := args[3].Int64() + flag := args[4].Int() + + wait := flag&linux.IPC_NOWAIT != linux.IPC_NOWAIT + except := flag&linux.MSG_EXCEPT == linux.MSG_EXCEPT + truncate := flag&linux.MSG_NOERROR == linux.MSG_NOERROR + + msgCopy := flag&linux.MSG_COPY == linux.MSG_COPY + + msg, err := receive(t, id, mType, size, msgCopy, wait, truncate, except) + if err != nil { + return 0, nil, err + } + + buf := linux.MsgBuf{ + Type: primitive.Int64(msg.Type), + Text: msg.Text, + } + if _, err := buf.CopyOut(t, msgAddr); err != nil { + return 0, nil, err + } + return uintptr(msg.Size), nil, nil +} + +// receive returns a message from the queue with the given ID. If msgCopy is +// true, a message is copied from the queue without being removed. Otherwise, +// a message is removed from the queue and returned. +func receive(t *kernel.Task, id ipc.ID, mType int64, maxSize int64, msgCopy, wait, truncate, except bool) (*msgqueue.Message, error) { + pid := int32(t.ThreadGroup().ID()) + + queue, err := t.IPCNamespace().MsgqueueRegistry().FindByID(id) + if err != nil { + return nil, err + } + + if msgCopy { + if wait || except { + return nil, linuxerr.EINVAL + } + return queue.Copy(mType) + } + return queue.Receive(t, t, mType, maxSize, wait, truncate, except, pid) +} + +// Msgctl implements msgctl(2). +func Msgctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := ipc.ID(args[0].Int()) + cmd := args[1].Int() + buf := args[2].Pointer() + + creds := auth.CredentialsFromContext(t) + + r := t.IPCNamespace().MsgqueueRegistry() + + switch cmd { + case linux.IPC_INFO: + info := r.IPCInfo(t) + _, err := info.CopyOut(t, buf) + return 0, nil, err + case linux.MSG_INFO: + msgInfo := r.MsgInfo(t) + _, err := msgInfo.CopyOut(t, buf) + return 0, nil, err + case linux.IPC_RMID: + return 0, nil, r.Remove(id, creds) + } + + // Remaining commands use a queue. + queue, err := r.FindByID(id) + if err != nil { + return 0, nil, err + } + + switch cmd { + case linux.MSG_STAT: + // Technically, we should be treating id as "an index into the kernel's + // internal array that maintains information about all shared memory + // segments on the system". Since we don't track segments in an array, + // we'll just pretend the msqid is the index and do the same thing as + // IPC_STAT. Linux also uses the index as the msqid. + fallthrough + case linux.IPC_STAT: + stat, err := queue.Stat(t) + if err != nil { + return 0, nil, err + } + _, err = stat.CopyOut(t, buf) + return 0, nil, err + + case linux.MSG_STAT_ANY: + stat, err := queue.StatAny(t) + if err != nil { + return 0, nil, err + } + _, err = stat.CopyOut(t, buf) + return 0, nil, err + + case linux.IPC_SET: + var ds linux.MsqidDS + if _, err := ds.CopyIn(t, buf); err != nil { + return 0, nil, linuxerr.EINVAL + } + err := queue.Set(t, &ds) + return 0, nil, err + + default: + return 0, nil, linuxerr.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index d95034347..5925c2263 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -16,13 +16,13 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -30,7 +30,7 @@ import ( // pipe2 implements the actual system call with flags. func pipe2(t *kernel.Task, addr hostarch.Addr, flags uint) (uintptr, error) { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize) diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index da548a14a..ee4dbbc64 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -18,13 +18,14 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) @@ -128,7 +129,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // Wait for a notification. timeout, err = t.BlockWithTimeout(ch, !forever, timeout) if err != nil { - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = nil } return timeout, 0, err @@ -157,7 +158,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // CopyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. func CopyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } pfd := make([]linux.PollFD, nfds) @@ -184,7 +185,7 @@ func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration pfd[i].Events |= linux.POLLHUP | linux.POLLERR } remainingTimeout, n, err := pollBlock(t, pfd, timeout) - err = syserror.ConvertIntr(err, syserror.EINTR) + err = syserr.ConvertIntr(err, linuxerr.EINTR) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. @@ -217,7 +218,7 @@ func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialB func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Calculate the size of the fd sets (one bit per fd). @@ -264,7 +265,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // OK. Linux is racy in the same way. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } file.DecRef(t) @@ -294,7 +295,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // Do the syscall, then count the number of bits set. if _, _, err = pollBlock(t, pfd, timeout); err != nil { - return 0, syserror.ConvertIntr(err, syserror.EINTR) + return 0, syserr.ConvertIntr(err, linuxerr.EINTR) } // r, w, and e are currently event mask bitsets; unset bits corresponding @@ -404,13 +405,13 @@ func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duration) (uintptr, error) { remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) // On an interrupt poll(2) is restarted with the remaining timeout. - if err == syserror.EINTR { + if linuxerr.Equals(linuxerr.EINTR, err) { t.SetSyscallRestartBlock(&pollRestartBlock{ pfdAddr: pfdAddr, nfds: nfds, timeout: remainingTimeout, }) - return 0, syserror.ERESTART_RESTARTBLOCK + return 0, linuxerr.ERESTART_RESTARTBLOCK } return n, err } @@ -463,8 +464,8 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. - if err == syserror.EINTR && copyErr == nil { - err = syserror.ERESTARTNOHAND + if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -485,7 +486,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } if timeval.Sec < 0 || timeval.Usec < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } timeout = time.Duration(timeval.ToNsecCapped()) } @@ -493,8 +494,8 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. - if err == syserror.EINTR && copyErr == nil { - err = syserror.ERESTARTNOHAND + if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -538,8 +539,8 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // See comment in Ppoll. - if err == syserror.EINTR && copyErr == nil { - err = syserror.ERESTARTNOHAND + if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { + err = linuxerr.ERESTARTNOHAND } return n, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 9890dd946..2ef1e6404 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/syserror" ) // Prctl implements linux syscall prctl(2). @@ -38,7 +38,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_PDEATHSIG: sig := linux.Signal(args[1].Int()) if sig != 0 && !sig.IsValid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.SetParentDeathSignal(sig) return 0, nil, nil @@ -69,7 +69,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall d = mm.UserDumpable default: // N.B. Userspace may not pass SUID_DUMP_ROOT. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.MemoryManager().SetDumpability(d) return 0, nil, nil @@ -90,7 +90,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else if val == 1 { t.SetKeepCaps(true) } else { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil @@ -98,7 +98,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_NAME: addr := args[1].Pointer() name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) - if err != nil && err != syserror.ENAMETOOLONG { + if err != nil && !linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { return 0, nil, err } t.SetName(name) @@ -118,7 +118,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_MM: if !t.HasCapability(linux.CAP_SYS_RESOURCE) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } switch args[1].Int() { @@ -127,13 +127,13 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Set the underlying executable. @@ -155,12 +155,12 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } case linux.PR_SET_NO_NEW_PRIVS: if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // PR_SET_NO_NEW_PRIVS is assumed to always be set. // See kernel.Task.updateCredsForExecLocked. @@ -168,7 +168,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_GET_NO_NEW_PRIVS: if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 1, nil, nil @@ -184,7 +184,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall default: tracer := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) if tracer == nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } t.SetYAMAException(tracer) return 0, nil, nil @@ -193,7 +193,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_SET_SECCOMP: if args[1].Int() != linux.SECCOMP_MODE_FILTER { // Unsupported mode. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) @@ -204,7 +204,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_READ: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var rv uintptr if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { @@ -215,10 +215,25 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_CAPBSET_DROP: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, t.DropBoundingCapability(cp) + case linux.PR_SET_CHILD_SUBREAPER: + // "If arg2 is nonzero, set the "child subreaper" attribute of + // the calling process; if arg2 is zero, unset the attribute." + // + // TODO(gvisor.dev/issues/2323): We only support setting, and + // only if the task is already TID 1 in the PID namespace, + // because it already acts as a subreaper in that case. + isPid1 := t.PIDNamespace().IDOfTask(t) == kernel.InitTID + if args[1].Int() != 0 && isPid1 { + return 0, nil, nil + } + + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, linuxerr.EINVAL + case linux.PR_GET_TIMING, linux.PR_SET_TIMING, linux.PR_GET_TSC, @@ -230,7 +245,6 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall linux.PR_MCE_KILL, linux.PR_MCE_KILL_GET, linux.PR_GET_TID_ADDRESS, - linux.PR_SET_CHILD_SUBREAPER, linux.PR_GET_CHILD_SUBREAPER, linux.PR_GET_THP_DISABLE, linux.PR_SET_THP_DISABLE, @@ -240,7 +254,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index ae545f80f..f86e87bc7 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -18,14 +18,13 @@ import ( "io" "math" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) const ( @@ -47,7 +46,7 @@ func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Flags are checked for validity but otherwise ignored. See above. if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if length > math.MaxInt32 { @@ -55,7 +54,7 @@ func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } ar, ok := addr.ToRange(uint64(length)) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } // "If the urandom source has been initialized, reads of up to 256 bytes diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 13e5e3a51..18ea23913 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -18,12 +18,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -46,19 +46,19 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -71,7 +71,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "read", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "read", file) } // Readahead implements readahead(2). @@ -82,29 +82,29 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is valid. if int(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Return EINVAL; if the underlying file type does not support readahead, // then Linux will return EINVAL to indicate as much. In the future, we // may extend this function to actually support readahead hints. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Pread64 implements linux syscall pread64(2). @@ -116,29 +116,29 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is reading at an offset supported? if !file.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pread64", file) } // Readv implements linux syscall readv(2). @@ -162,13 +162,13 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the destination of the read. @@ -181,7 +181,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "readv", file) } // Preadv implements linux syscall preadv(2). @@ -193,23 +193,23 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is reading at an offset supported? if !file.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the destination of the read. @@ -222,7 +222,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv", file) } // Preadv2 implements linux syscall preadv2(2). @@ -242,30 +242,30 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is reading at an offset supported? if offset > -1 && !file.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is readable. if !file.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check flags field. // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is // accepted as a valid flag argument for preadv2. if flags&^linux.RWF_VALID != 0 { - return 0, nil, syserror.EOPNOTSUPP + return 0, nil, linuxerr.EOPNOTSUPP } // Read the iovecs that specify the destination of the read. @@ -280,17 +280,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if offset == -1 { n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) } n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) } func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { n, err := f.Readv(t, dst) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we read anything. f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) @@ -303,7 +303,7 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { var deadline ktime.Time if s, ok := f.FileOperations.(socket.Socket); ok { dl := s.RecvTimeout() - if dl < 0 && err == syserror.ErrWouldBlock { + if dl < 0 && err == linuxerr.ErrWouldBlock { return n, err } if dl > 0 { @@ -325,14 +325,14 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { // other than "would block". n, err = f.Readv(t, dst) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -350,7 +350,7 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { n, err := f.Preadv(t, dst, offset) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we read anything. f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) @@ -371,7 +371,7 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i // other than "would block". n, err = f.Preadv(t, dst, offset+total) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index e64246d57..7210333d2 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -16,12 +16,12 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" ) // rlimit describes an implementation of 'struct rlimit', which may vary from @@ -43,7 +43,7 @@ func newRlimit(t *kernel.Task) (rlimit, error) { // On 64-bit system, struct rlimit and struct rlimit64 are identical. return &rlimit64{}, nil default: - return nil, syserror.ENOSYS + return nil, linuxerr.ENOSYS } } @@ -105,7 +105,7 @@ func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) } if _, ok := setableLimits[resource]; !ok { - return limits.Limit{}, syserror.EPERM + return limits.Limit{}, linuxerr.EPERM } // "A privileged process (under Linux: one with the CAP_SYS_RESOURCE @@ -129,7 +129,7 @@ func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) @@ -150,7 +150,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) @@ -158,7 +158,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } if _, err := rlim.CopyIn(t, addr); err != nil { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } _, err = prlimit64(t, resource, rlim.toLimit()) return 0, nil, err @@ -170,7 +170,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys resource, ok := limits.FromLinuxResource[int(args[1].Int())] if !ok { // Return err; unknown limit. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } newRlimAddr := args[2].Pointer() oldRlimAddr := args[3].Pointer() @@ -179,18 +179,18 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if newRlimAddr != 0 { var nrl rlimit64 if err := nrl.copyIn(t, newRlimAddr); err != nil { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } newLim = nrl.toLimit() } if tid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ot := t if tid > 0 { if ot = t.PIDNamespace().TaskWithID(tid); ot == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -207,7 +207,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys cred.RealKGID != tcred.RealKGID || cred.RealKGID != tcred.EffectiveKGID || cred.RealKGID != tcred.SavedKGID { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } } @@ -218,7 +218,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if oldRlimAddr != 0 { if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } } diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go index 90db10ea6..8328a3742 100644 --- a/pkg/sentry/syscalls/linux/sys_rseq.go +++ b/pkg/sentry/syscalls/linux/sys_rseq.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // RSeq implements syscall rseq(2). @@ -32,7 +32,7 @@ func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Event for applications that want rseq on a configuration // that doesn't support them. t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } switch flags { @@ -43,6 +43,6 @@ func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return 0, nil, t.ClearRSeq(addr, length, signature) default: // Unknown flag. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index ac5c98a54..a689abcc9 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/syserror" ) func getrusage(t *kernel.Task, which int32) linux.Rusage { @@ -76,7 +76,7 @@ func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys addr := args[1].Pointer() if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ru := getrusage(t, which) diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index bfcf44b6f..59c7a4b22 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -38,13 +38,13 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel pid := args[0].Int() param := args[1].Pointer() if param == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } r := SchedParam{schedPriority: onlyPriority} if _, err := r.CopyOut(t, param); err != nil { @@ -58,10 +58,10 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() if pid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } return onlyScheduler, nil, nil } @@ -72,20 +72,20 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke policy := args[1].Int() param := args[2].Pointer() if pid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if policy != onlyScheduler { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } var r SchedParam if _, err := r.CopyIn(t, param); err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if r.schedPriority != onlyPriority { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index e16d6ff3f..b0dc84b8d 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -17,10 +17,10 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. @@ -44,7 +44,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error { // We only support SECCOMP_SET_MODE_FILTER at the moment. if mode != linux.SECCOMP_SET_MODE_FILTER { // Unsupported mode. - return syserror.EINVAL + return linuxerr.EINVAL } tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 @@ -52,7 +52,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error { // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { // Unsupported flag. - return syserror.EINVAL + return linuxerr.EINVAL } var fprog userSockFprog @@ -66,7 +66,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error { compiledFilter, err := bpf.Compile(filter) if err != nil { t.Debugf("Invalid seccomp-bpf filter: %v", err) - return syserror.EINVAL + return linuxerr.EINVAL } return t.AppendSyscallFilter(compiledFilter, tsync) diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index c84260080..5a119b21c 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -19,20 +19,20 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ) const opsMax = 500 // SEMOPM // Semget handles: semget(key_t key, int nsems, int semflg) func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - key := args[0].Int() + key := ipc.Key(args[0].Int()) nsems := args[1].Int() flag := args[2].Int() @@ -46,7 +46,7 @@ func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - return uintptr(set.ID), nil, nil + return uintptr(set.ID()), nil, nil } // Semtimedop handles: semop(int semid, struct sembuf *sops, size_t nsops, const struct timespec *timeout) @@ -56,15 +56,15 @@ func Semtimedop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return Semop(t, args) } - id := args[0].Int() + id := ipc.ID(args[0].Int()) sembufAddr := args[1].Pointer() nsops := args[2].SizeT() timespecAddr := args[3].Pointer() if nsops <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if nsops > opsMax { - return 0, nil, syserror.E2BIG + return 0, nil, linuxerr.E2BIG } ops := make([]linux.Sembuf, nsops) @@ -77,12 +77,12 @@ func Semtimedop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } if timeout.Sec < 0 || timeout.Nsec < 0 || timeout.Nsec >= 1e9 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := semTimedOp(t, id, ops, true, timeout.ToDuration()); err != nil { - if err == syserror.ETIMEDOUT { - return 0, nil, syserror.EAGAIN + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + return 0, nil, linuxerr.EAGAIN } return 0, nil, err } @@ -91,15 +91,15 @@ func Semtimedop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Semop handles: semop(int semid, struct sembuf *sops, size_t nsops) func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := args[0].Int() + id := ipc.ID(args[0].Int()) sembufAddr := args[1].Pointer() nsops := args[2].SizeT() if nsops <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if nsops > opsMax { - return 0, nil, syserror.E2BIG + return 0, nil, linuxerr.E2BIG } ops := make([]linux.Sembuf, nsops) @@ -109,11 +109,11 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, semTimedOp(t, id, ops, false, time.Second) } -func semTimedOp(t *kernel.Task, id int32, ops []linux.Sembuf, haveTimeout bool, timeout time.Duration) error { +func semTimedOp(t *kernel.Task, id ipc.ID, ops []linux.Sembuf, haveTimeout bool, timeout time.Duration) error { set := t.IPCNamespace().SemaphoreRegistry().FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) @@ -131,7 +131,7 @@ func semTimedOp(t *kernel.Task, id int32, ops []linux.Sembuf, haveTimeout bool, // Semctl handles: semctl(int semid, int semnum, int cmd, ...) func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := args[0].Int() + id := ipc.ID(args[0].Int()) num := args[1].Int() cmd := args[2].Int() @@ -139,7 +139,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.SETVAL: val := args[3].Int() if val > math.MaxInt16 { - return 0, nil, syserror.ERANGE + return 0, nil, linuxerr.ERANGE } return 0, nil, setVal(t, id, num, int16(val)) @@ -165,8 +165,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777)) - return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms) + return 0, nil, ipcSet(t, id, &s) case linux.GETPID: v, err := getPID(t, id, num) @@ -210,7 +209,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.SEM_STAT: arg := args[3].Pointer() // id is an index in SEM_STAT. - semid, ds, err := semStat(t, id) + semid, ds, err := semStat(t, int32(id)) if err != nil { return 0, nil, err } @@ -222,7 +221,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.SEM_STAT_ANY: arg := args[3].Pointer() // id is an index in SEM_STAT. - semid, ds, err := semStatAny(t, id) + semid, ds, err := semStatAny(t, int32(id)) if err != nil { return 0, nil, err } @@ -232,41 +231,30 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return uintptr(semid), nil, err default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } -func remove(t *kernel.Task, id int32) error { +func remove(t *kernel.Task, id ipc.ID) error { r := t.IPCNamespace().SemaphoreRegistry() creds := auth.CredentialsFromContext(t) - return r.RemoveID(id, creds) + return r.Remove(id, creds) } -func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error { +func ipcSet(t *kernel.Task, id ipc.ID, ds *linux.SemidDS) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } - - creds := auth.CredentialsFromContext(t) - kuid := creds.UserNamespace.MapToKUID(uid) - if !kuid.Ok() { - return syserror.EINVAL - } - kgid := creds.UserNamespace.MapToKGID(gid) - if !kgid.Ok() { - return syserror.EINVAL - } - owner := fs.FileOwner{UID: kuid, GID: kgid} - return set.Change(t, creds, owner, perms) + return set.Set(t, ds) } -func ipcStat(t *kernel.Task, id int32) (*linux.SemidDS, error) { +func ipcStat(t *kernel.Task, id ipc.ID) (*linux.SemidDS, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.GetStat(creds) @@ -276,45 +264,45 @@ func semStat(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByIndex(index) if set == nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) ds, err := set.GetStat(creds) if err != nil { return 0, ds, err } - return set.ID, ds, nil + return int32(set.ID()), ds, nil } func semStatAny(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) { set := t.IPCNamespace().SemaphoreRegistry().FindByIndex(index) if set == nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) ds, err := set.GetStatAny(creds) if err != nil { return 0, ds, err } - return set.ID, ds, nil + return int32(set.ID()), ds, nil } -func setVal(t *kernel.Task, id int32, num int32, val int16) error { +func setVal(t *kernel.Task, id ipc.ID, num int32, val int16) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) return set.SetVal(t, num, val, creds, int32(pid)) } -func setValAll(t *kernel.Task, id int32, array hostarch.Addr) error { +func setValAll(t *kernel.Task, id ipc.ID, array hostarch.Addr) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } vals := make([]uint16, set.Size()) if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil { @@ -325,21 +313,21 @@ func setValAll(t *kernel.Task, id int32, array hostarch.Addr) error { return set.SetValAll(t, vals, creds, int32(pid)) } -func getVal(t *kernel.Task, id int32, num int32) (int16, error) { +func getVal(t *kernel.Task, id ipc.ID, num int32) (int16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.GetVal(num, creds) } -func getValAll(t *kernel.Task, id int32, array hostarch.Addr) error { +func getValAll(t *kernel.Task, id ipc.ID, array hostarch.Addr) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return syserror.EINVAL + return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) vals, err := set.GetValAll(creds) @@ -350,11 +338,11 @@ func getValAll(t *kernel.Task, id int32, array hostarch.Addr) error { return err } -func getPID(t *kernel.Task, id int32, num int32) (int32, error) { +func getPID(t *kernel.Task, id ipc.ID, num int32) (int32, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) gpid, err := set.GetPID(num, creds) @@ -369,21 +357,21 @@ func getPID(t *kernel.Task, id int32, num int32) (int32, error) { return int32(tg.ID()), nil } -func getZCnt(t *kernel.Task, id int32, num int32) (uint16, error) { +func getZCnt(t *kernel.Task, id ipc.ID, num int32) (uint16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.CountZeroWaiters(num, creds) } -func getNCnt(t *kernel.Task, id int32, num int32) (uint16, error) { +func getNCnt(t *kernel.Task, id ipc.ID, num int32) (uint16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.CountNegativeWaiters(num, creds) diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 584064143..840540506 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -16,15 +16,16 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" - "gvisor.dev/gvisor/pkg/syserror" ) // Shmget implements shmget(2). func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - key := shm.Key(args[0].Int()) + key := ipc.Key(args[0].Int()) size := uint64(args[1].SizeT()) flag := args[2].Int() @@ -40,31 +41,31 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } defer segment.DecRef(t) - return uintptr(segment.ID), nil, nil + return uintptr(segment.ID()), nil, nil } // findSegment retrives a shm segment by the given id. // // findSegment returns a reference on Shm. -func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) { +func findSegment(t *kernel.Task, id ipc.ID) (*shm.Shm, error) { r := t.IPCNamespace().ShmRegistry() segment := r.FindByID(id) if segment == nil { // No segment with provided id. - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } return segment, nil } // Shmat implements shmat(2). func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := shm.ID(args[0].Int()) + id := ipc.ID(args[0].Int()) addr := args[1].Pointer() flag := args[2].Int() segment, err := findSegment(t, id) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) @@ -89,7 +90,7 @@ func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Shmctl implements shmctl(2). func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := shm.ID(args[0].Int()) + id := ipc.ID(args[0].Int()) cmd := args[1].Int() buf := args[2].Pointer() @@ -106,7 +107,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.IPC_STAT: segment, err := findSegment(t, id) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) @@ -130,7 +131,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Remaining commands refer to a specific segment. segment, err := findSegment(t, id) if err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) @@ -155,6 +156,6 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 27a7f7fe1..03871d713 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -19,12 +19,13 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // "For a process to have permission to send a signal it must @@ -79,10 +80,10 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC for { target := t.PIDNamespace().TaskWithID(pid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } info := &linux.SignalInfo{ Signo: int32(sig), @@ -90,7 +91,7 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } info.SetPID(int32(target.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) - if err := target.SendGroupSignal(info); err != syserror.ESRCH { + if err := target.SendGroupSignal(info); !linuxerr.Equals(linuxerr.ESRCH, err) { return 0, nil, err } } @@ -130,7 +131,7 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) err := tg.SendSignal(info) - if err == syserror.ESRCH { + if linuxerr.Equals(linuxerr.ESRCH, err) { // ESRCH is ignored because it means the task // exited while we were iterating. This is a // race which would not normally exist on @@ -145,7 +146,7 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if delivered > 0 { return 0, nil, lastErr } - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH default: // "If pid equals 0, then sig is sent to every process in the process // group of the calling process." @@ -159,11 +160,11 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // If pid != -1 (i.e. signalling a process group), the returned error // is the last error from any call to group_send_sig_info. - lastErr := syserror.ESRCH + lastErr := error(linuxerr.ESRCH) for _, tg := range t.PIDNamespace().ThreadGroups() { if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { if !mayKill(t, tg.Leader(), sig) { - lastErr = syserror.EPERM + lastErr = linuxerr.EPERM continue } @@ -174,7 +175,7 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) // See note above regarding ESRCH race above. - if err := tg.SendSignal(info); err != syserror.ESRCH { + if err := tg.SendSignal(info); !linuxerr.Equals(linuxerr.ESRCH, err) { lastErr = err } } @@ -202,16 +203,16 @@ func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // N.B. Inconsistent with man page, linux actually rejects calls with // tid <=0 by EINVAL. This isn't the same for all signal calls. if tid <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } target := t.PIDNamespace().TaskWithID(tid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } @@ -225,17 +226,17 @@ func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // N.B. Inconsistent with man page, linux actually rejects calls with // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. if tgid <= 0 || tid <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } @@ -248,7 +249,7 @@ func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var newactptr *linux.SigAction @@ -291,7 +292,7 @@ func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } oldmask := t.SignalMask() if setaddr != 0 { @@ -308,7 +309,7 @@ func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel case linux.SIG_SETMASK: t.SetSignalMask(mask) default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } if oldaddr != 0 { @@ -338,7 +339,7 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // these semantics apply to changing the signal stack via a // ucontext during a signal handler. if !t.SetSignalStack(alt) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } } @@ -347,7 +348,7 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Pause implements linux syscall pause(2). func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) + return 0, nil, syserr.ConvertIntr(t.Block(nil), linuxerr.ERESTARTNOHAND) } // RtSigpending implements linux syscall rt_sigpending(2). @@ -377,7 +378,7 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne return 0, nil, err } if !d.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } timeout = time.Duration(d.ToNsecCapped()) } else { @@ -420,20 +421,20 @@ func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // Deliver to the given task's thread group. target := t.PIDNamespace().TaskWithID(pid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } - if err := target.SendGroupSignal(&info); err != syserror.ESRCH { + if err := target.SendGroupSignal(&info); !linuxerr.Equals(linuxerr.ESRCH, err) { return 0, nil, err } } @@ -449,7 +450,7 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // N.B. Inconsistent with man page, linux actually rejects calls with // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. if tgid <= 0 || tid <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy in the info. See RtSigqueueinfo above. @@ -463,17 +464,17 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(&info) } @@ -495,7 +496,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. t.SetSavedSignalMask(oldmask) // Perform the wait. - return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) + return 0, nil, syserr.ConvertIntr(t.Block(nil), linuxerr.ERESTARTNOHAND) } // RestartSyscall implements the linux syscall restart_syscall(2). @@ -511,7 +512,7 @@ func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // function is never null by (re)initializing it with one that translates // the restart into EINTR. We'll emulate that behaviour. t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") - return 0, nil, syserror.EINTR + return 0, nil, linuxerr.EINTR } // sharedSignalfd is shared between the two calls. @@ -524,7 +525,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u // Always check for valid flags, even if not creating. if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is this a change to an existing signalfd? @@ -533,7 +534,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u if fd != -1 { file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -544,7 +545,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u } // Not a signalfd. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create a new file. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index e07917613..50ddbc142 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -18,6 +18,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -29,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -117,7 +117,7 @@ type multipleMessageHeader64 struct { // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } addrBuf := make([]byte, addrlen) @@ -139,7 +139,7 @@ func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr h } if int32(bufLen) < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // Write the length unconditionally. @@ -173,7 +173,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create the new socket. @@ -205,7 +205,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } fileFlags := fs.SettableFileFlags{ @@ -252,14 +252,14 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -269,7 +269,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } blocking := !file.Flags().NonBlocking - return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(s.Connect(t, a, blocking).ToError(), linuxerr.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept @@ -277,20 +277,20 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } // Call the syscall implementation for this socket, then copy the @@ -300,12 +300,12 @@ func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. - if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { + if err := writeAddress(t, peer, peerLen, addr, addrLen); linuxerr.Equals(linuxerr.EINVAL, err) { return 0, err } } @@ -342,14 +342,14 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -369,14 +369,14 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if backlog > maxListenBacklog { @@ -407,21 +407,21 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Validate how, then call syscall implementation. switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() @@ -438,14 +438,14 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Read the length. Reject negative values. @@ -454,7 +454,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Call syscall implementation then copy both value and value len out. @@ -519,21 +519,21 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if optLen > maxOptLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyInBytes(optValAddr, buf); err != nil { @@ -557,14 +557,14 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket name and copy it to the caller. @@ -585,14 +585,14 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket peer name and copy it to the caller. @@ -612,25 +612,25 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if file.Flags().NonBlocking { @@ -660,7 +660,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -669,20 +669,20 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if file.Flags().NonBlocking { @@ -697,7 +697,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if !ts.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true @@ -717,7 +717,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { @@ -727,7 +727,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -749,7 +749,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags } if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -762,7 +762,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { - return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(err.ToError(), linuxerr.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC @@ -780,11 +780,11 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags } if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } defer cms.Release(t) @@ -829,25 +829,25 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) { if int(bufLen) < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if file.Flags().NonBlocking { @@ -873,7 +873,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, fla n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) cm.Release(t) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } // Copy the address to the caller. @@ -907,25 +907,25 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if file.Flags().NonBlocking { @@ -945,7 +945,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -955,19 +955,19 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if file.Flags().NonBlocking { @@ -979,7 +979,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { @@ -989,7 +989,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -1014,7 +1014,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil { @@ -1034,7 +1034,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -1059,7 +1059,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) - err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) + err = handleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendmsg", file) // Control messages should be released on error as well as for zero-length // messages, which are discarded by the receiver. if n == 0 || err != nil { @@ -1073,20 +1073,20 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr hostar func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if file.Flags().NonBlocking { @@ -1121,7 +1121,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) - return uintptr(n), handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file) + return uintptr(n), handleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 134051124..8c8847efa 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -16,18 +16,18 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) // doSplice implements a blocking splice operation. func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) { if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if opts.Length == 0 { return 0, nil @@ -45,9 +45,9 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB for { n, err = fs.Splice(t, outFile, inFile, opts) - if n != 0 || err != syserror.ErrWouldBlock { + if n != 0 || err != linuxerr.ErrWouldBlock { break - } else if err == syserror.ErrWouldBlock && nonBlocking { + } else if err == linuxerr.ErrWouldBlock && nonBlocking { break } @@ -105,33 +105,33 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get files. inFile := t.GetFile(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) if !inFile.Flags().Read { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } outFile := t.GetFile(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) if !outFile.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Verify that the outfile Append flag is not set. if outFile.Flags().Append { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Verify that we have a regular infile. This is a requirement; the // same check appears in Linux (fs/splice.c:splice_direct_to_actor). if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var ( @@ -142,7 +142,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Verify that when offset address is not null, infile must be // seekable. The fs.Splice routine itself validates basic read. if !inFile.Flags().Pread { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Copy in the offset. @@ -176,7 +176,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // We can only pass a single file to handleIOError, so pick inFile // arbitrarily. This is used only for debugging purposes. - return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "sendfile", inFile) + return uintptr(n), nil, handleIOError(t, false, err, linuxerr.ERESTARTSYS, "sendfile", inFile) } // Splice implements splice(2). @@ -190,19 +190,19 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get files. outFile := t.GetFile(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) @@ -226,11 +226,11 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal switch { case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr): if inOffset != 0 { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if outOffset != 0 { if !outFile.Flags().Pwrite { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var offset int64 @@ -244,11 +244,11 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr): if outOffset != 0 { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if inOffset != 0 { if !inFile.Flags().Pread { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var offset int64 @@ -262,15 +262,15 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr): if inOffset != 0 || outOffset != 0 { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // We may not refer to the same pipe; otherwise it's a continuous loop. if inFileAttr.InodeID == outFileAttr.InodeID { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Splice data. @@ -286,7 +286,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // See above; inFile is chosen arbitrarily here. - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "splice", inFile) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "splice", inFile) } // Tee imlements tee(2). @@ -298,30 +298,30 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get files. outFile := t.GetFile(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) // All files must be pipes. if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // We may not refer to the same pipe; see above. if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // The operation is non-blocking if anything is non-blocking. @@ -339,5 +339,5 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo } // See above; inFile is chosen arbitrarily here. - return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "tee", inFile) + return uintptr(n), nil, handleIOError(t, false, err, linuxerr.ERESTARTSYS, "tee", inFile) } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 2338ba44b..3da385c66 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -16,11 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -56,7 +56,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Annoying. What's wrong with fstat? file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -98,7 +98,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -108,7 +108,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // stat implements stat from the given *fs.Dirent. func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr hostarch.Addr) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(t) if err != nil { @@ -139,13 +139,13 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall statxAddr := args[4].Pointer() if mask&linux.STATX__RESERVED != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, dirPath, err := copyInPath(t, pathAddr, flags&linux.AT_EMPTY_PATH != 0) @@ -156,7 +156,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if path == "" { file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) uattr, err := file.UnstableAttr(t) @@ -170,7 +170,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } uattr, err := d.Inode.UnstableAttr(t) if err != nil { @@ -247,7 +247,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go index 0a04a6113..e38066ea8 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go index 5a3b1bfad..b2ea390c5 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package linux diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 5ebd4461f..0c22599bf 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -16,10 +16,11 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // LINT.IfChange @@ -37,7 +38,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -52,12 +53,12 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // Fdatasync implements linux syscall fdatasync(2). @@ -68,12 +69,12 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // SyncFileRange implements linux syscall sync_file_rage(2) @@ -86,13 +87,13 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel uflags := args[3].Uint() if offset < 0 || offset+nbytes < offset { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if uflags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE| linux.SYNC_FILE_RANGE_WRITE| linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if nbytes == 0 { @@ -101,7 +102,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -111,7 +112,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if uflags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // SYNC_FILE_RANGE_WRITE initiates write-out of all dirty pages in the @@ -136,7 +137,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData) } - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // LINT.ThenChange(vfs2/sync.go) diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go index 40c8bb061..15acb2b8b 100644 --- a/pkg/sentry/syscalls/linux/sys_syslog.go +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -15,9 +15,9 @@ package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -40,7 +40,7 @@ func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal switch command { case _SYSLOG_ACTION_READ_ALL: if size < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if size > logBufLen { size = logBufLen @@ -56,6 +56,6 @@ func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case _SYSLOG_ACTION_SIZE_BUFFER: return logBufLen, nil, nil default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 0d5056303..d74173c56 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -17,8 +17,8 @@ package linux import ( "path" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -27,15 +27,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -const ( - // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. - exitSignalMask = 0xff -) - var ( // ExecMaxTotalSize is the maximum length of all argv and envv entries. // @@ -112,11 +106,11 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host } if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } atEmptyPath := flags&linux.AT_EMPTY_PATH != 0 if !atEmptyPath && len(pathname) == 0 { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 @@ -135,7 +129,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host // Need to extract the given FD. f, fdFlags := t.FDTable().Get(dirFD) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) closeOnExec = fdFlags.CloseOnExec @@ -154,7 +148,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host wd = f.Dirent wd.IncRef() if !fs.IsDir(wd.Inode.StableAttr) { - return 0, nil, syserror.ENOTDIR + return 0, nil, linuxerr.ENOTDIR } } } @@ -187,47 +181,30 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr host // Exit implements linux syscall exit(2). func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - status := int(args[0].Int()) - t.PrepareExit(kernel.ExitStatus{Code: status}) + status := args[0].Int() + t.PrepareExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // ExitGroup implements linux syscall exit_group(2). func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - status := int(args[0].Int()) - t.PrepareGroupExit(kernel.ExitStatus{Code: status}) + status := args[0].Int() + t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // clone is used by Clone, Fork, and VFork. func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) { - opts := kernel.CloneOptions{ - SharingOptions: kernel.SharingOptions{ - NewAddressSpace: flags&linux.CLONE_VM == 0, - NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, - NewThreadGroup: flags&linux.CLONE_THREAD == 0, - TerminationSignal: linux.Signal(flags & exitSignalMask), - NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, - NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, - NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, - NewFiles: flags&linux.CLONE_FILES == 0, - NewFSContext: flags&linux.CLONE_FS == 0, - NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, - NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, - }, - Stack: stack, - SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, - TLS: tls, - ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, - ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, - ChildTID: childTID, - ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, - ParentTID: parentTID, - Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, - Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, - InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, - } - ntid, ctrl, err := t.Clone(&opts) + args := linux.CloneArgs{ + Flags: uint64(uint32(flags) &^ linux.CSIGNAL), + Pidfd: uint64(parentTID), + ChildTID: uint64(childTID), + ParentTID: uint64(parentTID), + ExitSignal: uint64(flags & linux.CSIGNAL), + Stack: uint64(stack), + TLS: uint64(tls), + } + ntid, ctrl, err := t.Clone(&args) return uintptr(ntid), ctrl, err } @@ -260,13 +237,13 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { wopts.NonCloneTasks = true wopts.CloneTasks = true default: - return syserror.EINVAL + return linuxerr.EINVAL } if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue } if options&linux.WNOHANG == 0 { - wopts.BlockInterruptErr = syserror.ERESTARTSYS + wopts.BlockInterruptErr = linuxerr.ERESTARTSYS } if options&linux.WNOTHREAD == 0 { wopts.SiblingChildren = true @@ -277,7 +254,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventExit | kernel.EventTraceeStop, @@ -315,7 +292,7 @@ func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusag return 0, err } if statusAddr != 0 { - if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil { + if _, err := primitive.CopyUint32Out(t, statusAddr, uint32(wr.Status)); err != nil { return 0, err } } @@ -358,10 +335,10 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal rusageAddr := args[4].Pointer() if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventTraceeStop, @@ -374,7 +351,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if err := parseCommonWaitOptions(&wopts, options); err != nil { @@ -418,23 +395,22 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } si.SetPID(int32(wr.TID)) si.SetUID(int32(wr.UID)) - // TODO(b/73541790): convert kernel.ExitStatus to functions and make - // WaitResult.Status a linux.WaitStatus. - s := unix.WaitStatus(wr.Status) + s := wr.Status switch { case s.Exited(): si.Code = linux.CLD_EXITED si.SetStatus(int32(s.ExitStatus())) case s.Signaled(): - si.Code = linux.CLD_KILLED - si.SetStatus(int32(s.Signal())) - case s.CoreDump(): - si.Code = linux.CLD_DUMPED - si.SetStatus(int32(s.Signal())) + if s.CoreDumped() { + si.Code = linux.CLD_DUMPED + } else { + si.Code = linux.CLD_KILLED + } + si.SetStatus(int32(s.TerminationSignal())) case s.Stopped(): if wr.Event == kernel.EventTraceeStop { si.Code = linux.CLD_TRAPPED - si.SetStatus(int32(s.TrapCause())) + si.SetStatus(int32(s.PtraceEvent())) } else { si.Code = linux.CLD_STOPPED si.SetStatus(int32(s.StopSignal())) @@ -461,29 +437,16 @@ func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // Unshare implements linux syscall unshare(2). func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() - opts := kernel.SharingOptions{ - NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, - NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, - NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, - NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, - NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, - NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, - NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, - NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, - NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, - NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, - } // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) - if opts.NewPIDNamespace { - opts.NewThreadGroup = true + if flags&linux.CLONE_NEWPID != 0 { + flags |= linux.CLONE_THREAD } // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." - if opts.NewUserNamespace { - opts.NewThreadGroup = true - opts.NewFSContext = true + if flags&linux.CLONE_NEWUSER != 0 { + flags |= linux.CLONE_THREAD | linux.CLONE_FS } - return 0, nil, t.Unshare(&opts) + return 0, nil, t.Unshare(flags) } // SchedYield implements linux syscall sched_yield(2). @@ -504,7 +467,7 @@ func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -528,7 +491,7 @@ func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // in an array of "unsigned long" so the buffer needs to // be a multiple of the word size. if size&(t.Arch().Width()-1) > 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var task *kernel.Task @@ -537,7 +500,7 @@ func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -545,7 +508,7 @@ func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // The buffer needs to be big enough to hold a cpumask with // all possible cpus. if size < mask.Size() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } _, err := t.CopyOutBytes(maskAddr, mask) @@ -590,16 +553,16 @@ func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if pid != 0 { ot := t.PIDNamespace().TaskWithID(pid) if ot == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } tg = ot.ThreadGroup() if tg.Leader() != ot { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Setpgid only operates on child threadgroups. if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } } @@ -609,7 +572,7 @@ func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if pgid == 0 { pgid = defaultPGID } else if pgid < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // If the pgid is the same as the group, then create a new one. Otherwise, @@ -654,7 +617,7 @@ func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca target := t.PIDNamespace().TaskWithID(tid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil @@ -674,7 +637,7 @@ func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal target := t.PIDNamespace().TaskWithID(tid) if target == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil @@ -698,7 +661,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } // From kernel/sys.c:getpriority: @@ -712,7 +675,7 @@ func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -744,7 +707,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if task == nil { - return 0, nil, syserror.ESRCH + return 0, nil, linuxerr.ESRCH } task.SetNiceness(niceval) @@ -754,7 +717,7 @@ func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 5c3b3dee2..4adc8b8a4 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -19,12 +19,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // The most significant 29 bits hold either a pid or a file descriptor. @@ -75,7 +75,7 @@ func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } if _, err := getClock(t, clockID); err != nil { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if addr == 0 { @@ -94,12 +94,12 @@ type cpuClocker interface { func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { if clockID < 0 { if !isValidCPUClock(clockID) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } targetTask := targetTask(t, clockID) if targetTask == nil { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } var target cpuClocker @@ -116,7 +116,7 @@ func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { // CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF. return target.CPUClock(), nil default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } @@ -138,7 +138,7 @@ func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { case linux.CLOCK_THREAD_CPUTIME_ID: return t.CPUClock(), nil default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } @@ -157,7 +157,7 @@ func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // ClockSettime implements linux syscall clock_settime(2). func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } // Time implements linux syscall time(2). @@ -209,11 +209,11 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, end ktime.Time, rem host timer.Destroy() - switch err { - case syserror.ETIMEDOUT: + switch { + case linuxerr.Equals(linuxerr.ETIMEDOUT, err): // Slept for entire timeout. return nil - case syserror.ErrInterrupted: + case err == linuxerr.ErrInterrupted: // Interrupted. remaining := end.Sub(c.Now()) if remaining <= 0 { @@ -234,9 +234,9 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, end ktime.Time, rem host end: end, rem: rem, }) - return syserror.ERESTART_RESTARTBLOCK + return linuxerr.ERESTART_RESTARTBLOCK } - return syserror.ERESTARTNOHAND + return linuxerr.ERESTARTNOHAND default: panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) } @@ -253,7 +253,7 @@ func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if !ts.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Just like linux, we cap the timeout with the max number that int64 can @@ -276,7 +276,7 @@ func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } if !req.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Only allow clock constants also allowed by Linux. @@ -284,7 +284,7 @@ func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if clockID != linux.CLOCK_REALTIME && clockID != linux.CLOCK_MONOTONIC && clockID != linux.CLOCK_PROCESS_CPUTIME_ID { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index 45eef4feb..d39a0a6f5 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -18,9 +18,9 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const nsecPerSec = int64(time.Second) @@ -29,7 +29,7 @@ const nsecPerSec = int64(time.Second) func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if t.Arch().Width() != 8 { // Definition of linux.ItimerVal assumes 64-bit architecture. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } timerID := args[0].Int() @@ -51,7 +51,7 @@ func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if t.Arch().Width() != 8 { // Definition of linux.ItimerVal assumes 64-bit architecture. - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } timerID := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index cadd9d348..4eeb94231 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -16,12 +16,12 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // TimerfdCreate implements Linux syscall timerfd_create(2). @@ -30,7 +30,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel flags := args[1].Int() if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var c ktime.Clock @@ -40,7 +40,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: c = t.Kernel().MonotonicClock() default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } f := timerfd.NewFile(t, c) defer f.DecRef(t) @@ -66,18 +66,18 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne oldValAddr := args[3].Pointer() if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var newVal linux.Itimerspec @@ -105,13 +105,13 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } tm, s := tf.GetTime() diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go index 6ddd30d5c..bde672d67 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go @@ -12,16 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build amd64 +//go:build amd64 +// +build amd64 package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // ArchPrctl implements linux syscall arch_prctl(2). @@ -37,18 +38,18 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) fallthrough default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go index fb08a356e..dfa684387 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go @@ -12,17 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build arm64 +//go:build arm64 +// +build arm64 package linux import ( + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // ArchPrctl is not defined for ARM64. func ArchPrctl(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index 66c5974f5..4e945d2c0 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -16,9 +16,9 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Uname implements linux syscall uname. @@ -57,10 +57,10 @@ func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel utsns := t.UTSNamespace() if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if size < 0 || size > linux.UTSLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } name, err := t.CopyInString(nameAddr, int(size)) @@ -79,10 +79,10 @@ func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S utsns := t.UTSNamespace() if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } if size < 0 || size > linux.UTSLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } name := make([]byte, size) diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 95bfe6606..4a4ef5046 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -18,12 +18,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -46,19 +46,19 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -71,7 +71,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "write", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "write", file) } // Pwrite64 implements linux syscall pwrite64(2). @@ -83,29 +83,29 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is writing at an offset supported? if !file.Flags().Pwrite { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -118,7 +118,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwrite64", file) } // Writev implements linux syscall writev(2). @@ -129,13 +129,13 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the source of the write. @@ -148,7 +148,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "writev", file) } // Pwritev implements linux syscall pwritev(2). @@ -160,23 +160,23 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is writing at an offset supported? if !file.Flags().Pwrite { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Check that the file is writable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the source of the write. @@ -189,7 +189,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev", file) } // Pwritev2 implements linux syscall pwritev2(2). @@ -208,34 +208,34 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := int(args[5].Int()) if int(args[4].Int())&0x4 == 1 { - return 0, nil, syserror.EACCES + return 0, nil, linuxerr.EACCES } file := t.GetFile(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is writing at an offset supported? if offset > -1 && !file.Flags().Pwrite { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is // accepted as a valid flag argument for pwritev2. if flags&^linux.RWF_VALID != 0 { - return uintptr(flags), nil, syserror.EOPNOTSUPP + return uintptr(flags), nil, linuxerr.EOPNOTSUPP } // Check that the file is writeable. if !file.Flags().Write { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Read the iovecs that specify the source of the write. @@ -250,17 +250,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if offset == -1 { n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) } n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) } func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { n, err := f.Writev(t, src) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we wrote anything. f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) @@ -273,7 +273,7 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { var deadline ktime.Time if s, ok := f.FileOperations.(socket.Socket); ok { dl := s.SendTimeout() - if dl < 0 && err == syserror.ErrWouldBlock { + if dl < 0 && err == linuxerr.ErrWouldBlock { return n, err } if dl > 0 { @@ -295,14 +295,14 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { // anything other than "would block". n, err = f.Writev(t, src) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -320,7 +320,7 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) { n, err := f.Pwritev(t, src, offset) - if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if err != linuxerr.ErrWouldBlock || f.Flags().NonBlocking { if n > 0 { // Queue notification if we wrote anything. f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) @@ -341,7 +341,7 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) ( // anything other than "would block". n, err = f.Pwritev(t, src, offset+total) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 28ad6a60e..baaf31191 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -18,11 +18,11 @@ import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange @@ -47,7 +47,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -73,7 +73,7 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink n := 0 err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } n, err = getXattr(t, d, nameAddr, valueAddr, size) @@ -99,7 +99,7 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s // TODO(b/148380782): Support xattrs in namespaces other than "user". if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } // If getxattr(2) is called with size 0, the size of the value will be @@ -116,7 +116,7 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s } n := len(value) if uint64(n) > requestedSize { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } // Don't copy out the attribute value if size is 0. @@ -151,7 +151,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -172,7 +172,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags) @@ -182,7 +182,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink // setXattr implements setxattr(2) from the given *fs.Dirent. func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, size uint64, flags uint32) error { if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } name, err := copyInXattrName(t, nameAddr) @@ -195,7 +195,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s } if size > linux.XATTR_SIZE_MAX { - return syserror.E2BIG + return linuxerr.E2BIG } buf := make([]byte, size) if _, err := t.CopyInBytes(valueAddr, buf); err != nil { @@ -204,7 +204,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s value := string(buf) if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil { @@ -217,13 +217,13 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr hostarch.Addr, s func copyInXattrName(t *kernel.Task, nameAddr hostarch.Addr) (string, error) { name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) if err != nil { - if err == syserror.ENAMETOOLONG { - return "", syserror.ERANGE + if linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { + return "", linuxerr.ERANGE } return "", err } if len(name) == 0 { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return name, nil } @@ -241,9 +241,9 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error // Restrict xattrs to regular files and directories. if !xattrFileTypeOk(i) { if perms.Write { - return syserror.EPERM + return linuxerr.EPERM } - return syserror.ENODATA + return linuxerr.ENODATA } return i.CheckPermission(t, perms) @@ -268,7 +268,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -293,7 +293,7 @@ func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlin n := 0 err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } n, err = listXattr(t, d, listAddr, size) @@ -333,10 +333,10 @@ func listXattr(t *kernel.Task, d *fs.Dirent, addr hostarch.Addr, size uint64) (i listSize := xattrListSize(xattrs) if listSize > linux.XATTR_SIZE_MAX { - return 0, syserror.E2BIG + return 0, linuxerr.E2BIG } if uint64(listSize) > requestedSize { - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } // Don't copy out the attributes if size is 0. @@ -382,7 +382,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. f := t.GetFile(fd) if f == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer f.DecRef(t) @@ -400,7 +400,7 @@ func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSyml return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } return removeXattr(t, d, nameAddr) @@ -419,7 +419,7 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr hostarch.Addr) error { } if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP + return linuxerr.EOPNOTSUPP } if err := d.Inode.RemoveXattr(t, d, name); err != nil { diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 3edc922eb..d90652a3f 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -18,9 +18,9 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // copyTimespecIn copies a Timespec from the untrusted app range to the kernel. @@ -37,7 +37,7 @@ func copyTimespecIn(t *kernel.Task, addr hostarch.Addr) (linux.Timespec, error) ts.Nsec = int64(hostarch.ByteOrder.Uint64(in[8:])) return ts, nil default: - return linux.Timespec{}, syserror.ENOSYS + return linux.Timespec{}, linuxerr.ENOSYS } } @@ -51,7 +51,7 @@ func copyTimespecOut(t *kernel.Task, addr hostarch.Addr, ts *linux.Timespec) err _, err := t.CopyOutBytes(addr, out) return err default: - return syserror.ENOSYS + return linuxerr.ENOSYS } } @@ -69,7 +69,7 @@ func copyTimevalIn(t *kernel.Task, addr hostarch.Addr) (linux.Timeval, error) { tv.Usec = int64(hostarch.ByteOrder.Uint64(in[8:])) return tv, nil default: - return linux.Timeval{}, syserror.ENOSYS + return linux.Timeval{}, linuxerr.ENOSYS } } @@ -83,7 +83,7 @@ func copyTimevalOut(t *kernel.Task, addr hostarch.Addr, tv *linux.Timeval) error _, err := t.CopyOutBytes(addr, out) return err default: - return syserror.ENOSYS + return linuxerr.ENOSYS } } @@ -103,7 +103,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time. return 0, err } if !timespec.Valid() { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 5ce0bc714..1e3bd2a50 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -41,6 +41,7 @@ go_library( "//pkg/abi/linux", "//pkg/bits", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fspath", "//pkg/gohacks", "//pkg/hostarch", @@ -72,7 +73,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index fd1863ef3..0b57c0f7c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -17,6 +17,8 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" @@ -24,10 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) // IoSubmit implements linux syscall io_submit(2). @@ -37,7 +36,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc addr := args[2].Pointer() if nrEvents < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } for i := int32(0); i < nrEvents; i++ { @@ -56,7 +55,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } cbAddr = hostarch.Addr(cbAddrP) default: - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } // Copy in this callback. @@ -90,12 +89,12 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr hostarch.Addr) error { if cb.Reserved2 != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } fd := t.GetFileVFS2(cb.FD) if fd == nil { - return syserror.EBADF + return linuxerr.EBADF } defer fd.DecRef(t) @@ -104,13 +103,13 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFD = t.GetFileVFS2(cb.ResFD) if eventFD == nil { - return syserror.EBADF + return linuxerr.EBADF } defer eventFD.DecRef(t) // Check that it is an eventfd. if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -123,14 +122,14 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr host switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } } // Prepare the request. aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { - return syserror.EINVAL + return linuxerr.EINVAL } if err := aioCtx.Prepare(); err != nil { return err @@ -200,7 +199,7 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } // Since this I/O will be asynchronous with respect to t's task goroutine, @@ -222,6 +221,6 @@ func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) default: // Not a supported command. - return usermem.IOSequence{}, syserror.EINVAL + return usermem.IOSequence{}, linuxerr.EINVAL } } diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index 047d955b6..84010db77 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -19,12 +19,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -34,7 +34,7 @@ var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes() func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^linux.EPOLL_CLOEXEC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) @@ -59,7 +59,7 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // "Since Linux 2.6.8, the size argument is ignored, but must be greater // than zero" - epoll_create(2) if size <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) @@ -84,20 +84,20 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc epfile := t.GetFileVFS2(epfd) if epfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if epfile == file { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var event linux.EpollEvent @@ -115,24 +115,24 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } return 0, nil, ep.ModifyInterest(file, fd, event) default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } func waitEpoll(t *kernel.Task, epfd int32, eventsAddr hostarch.Addr, maxEvents int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } epfile := t.GetFileVFS2(epfd) if epfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Allocate space for a few events on the stack for the common case in @@ -174,7 +174,7 @@ func waitEpoll(t *kernel.Task, epfd int32, eventsAddr hostarch.Addr, maxEvents i haveDeadline = true } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = nil } return 0, nil, err diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index 807f909da..0dcf1fbff 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Eventfd2 implements linux syscall eventfd2(2). @@ -29,7 +29,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) if flags & ^allOps != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } vfsObj := t.Kernel().VFS() diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index 3315398a4..fcf2e25de 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -16,16 +16,15 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/loader" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Execve implements linux syscall execve(2). @@ -48,7 +47,7 @@ func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) @@ -83,11 +82,11 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr host // do_open_execat(fd=AT_FDCWD)), and the loader package is currently // incapable of handling this correctly. if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) if dirfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } start := dirfile.VirtualDentry() start.IncRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 1a31898e8..2198aa065 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -16,6 +16,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Close implements Linux syscall close(2). @@ -36,12 +36,12 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // (and other reference-holding operations complete). _, file := t.FDTable().Remove(t, fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) err := file.OnClose(t) - return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) + return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, linuxerr.EINTR, "close", file) } // Dup implements Linux syscall dup(2). @@ -50,13 +50,13 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { - return 0, nil, syserror.EMFILE + return 0, nil, linuxerr.EMFILE } return uintptr(newFD), nil, nil } @@ -70,7 +70,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // As long as oldfd is valid, dup2() does nothing and returns newfd. file := t.GetFileVFS2(oldfd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } file.DecRef(t) return uintptr(newfd), nil, nil @@ -86,7 +86,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC flags := args[2].Uint() if oldfd == newfd { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return dup3(t, oldfd, newfd, flags) @@ -94,12 +94,12 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { if flags&^linux.O_CLOEXEC != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(oldfd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -119,7 +119,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file, flags := t.FDTable().GetVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -128,7 +128,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: // allowed default: - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } } @@ -169,7 +169,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who @@ -192,7 +192,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } n, err := pipefile.SetPipeSize(int64(args[2].Int())) if err != nil { @@ -202,7 +202,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } return uintptr(pipefile.PipeSize()), nil, nil case linux.F_GET_SEALS: @@ -210,7 +210,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(val), nil, err case linux.F_ADD_SEALS: if !file.IsWritable() { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err @@ -232,7 +232,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, a.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -269,7 +269,7 @@ func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: // Acceptable type. default: - return syserror.EINVAL + return linuxerr.EINVAL } a := file.SetAsyncHandler(fasync.NewVFS2(fd)).(*fasync.FileAsync) @@ -282,26 +282,26 @@ func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) if task == nil { - return syserror.ESRCH + return linuxerr.ESRCH } a.SetOwnerTask(t, task) return nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) if tg == nil { - return syserror.ESRCH + return linuxerr.ESRCH } a.SetOwnerThreadGroup(t, tg) return nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) if pg == nil { - return syserror.ESRCH + return linuxerr.ESRCH } a.SetOwnerProcessGroup(t, pg) return nil default: - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -319,7 +319,7 @@ func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDes case linux.F_WRLCK: typ = lock.WriteLock default: - return syserror.EINVAL + return linuxerr.EINVAL } r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) if err != nil { @@ -368,13 +368,13 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip switch flock.Type { case linux.F_RDLCK: if !file.IsReadable() { - return syserror.EBADF + return linuxerr.EBADF } return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.ReadLock, r, blocker) case linux.F_WRLCK: if !file.IsWritable() { - return syserror.EBADF + return linuxerr.EBADF } return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.WriteLock, r, blocker) @@ -382,7 +382,7 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip return file.UnlockPOSIX(t, t.FDTable(), r) default: - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -395,22 +395,22 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Note: offset is allowed to be negative. if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } switch advice { @@ -421,7 +421,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.POSIX_FADV_DONTNEED: case linux.POSIX_FADV_NOREUSE: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Sure, whatever. diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 36aa1d3ae..f19f0fd41 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -16,12 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Link implements Linux syscall link(2). @@ -43,10 +42,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { - return syserror.ENOENT + return linuxerr.ENOENT } oldpath, err := copyInPath(t, oldpathAddr) @@ -290,7 +289,7 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags := args[2].Int() if flags&^linux.AT_REMOVEDIR != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&linux.AT_REMOVEDIR != 0 { @@ -320,7 +319,7 @@ func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpat return err } if len(target) == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } linkpath, err := copyInPath(t, linkpathAddr) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go index a7d4d2a36..1e36d9c76 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -16,11 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Getcwd implements Linux syscall getcwd(2). @@ -39,7 +39,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Note this is >= because we need a terminator. if uint(len(s)) >= size { - return 0, nil, syserror.ERANGE + return 0, nil, linuxerr.ERANGE } // Construct a byte slice containing a NUL terminator. @@ -106,7 +106,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() if !t.HasCapability(linux.CAP_SYS_CHROOT) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } path, err := copyInPath(t, addr) diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index b41a3056a..c2c3172bc 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -17,13 +17,12 @@ package vfs2 import ( "fmt" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Getdents implements Linux syscall getdents(2). @@ -43,7 +42,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -100,7 +99,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of 8 if size > cb.remaining { - return syserror.EINVAL + return linuxerr.EINVAL } buf = cb.t.CopyScratchBuffer(size) hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) @@ -134,7 +133,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of sizeof(long) if size > cb.remaining { - return syserror.EINVAL + return linuxerr.EINVAL } buf = cb.t.CopyScratchBuffer(size) hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 11753d8e5..d8d5dd7ad 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC @@ -28,7 +28,7 @@ const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^allFlags != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags)) @@ -60,14 +60,14 @@ func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, f := t.GetFileVFS2(fd) if f == nil { // Invalid fd. - return nil, nil, syserror.EBADF + return nil, nil, linuxerr.EBADF } ino, ok := f.Impl().(*vfs.Inotify) if !ok { // Not an inotify fd. f.DecRef(t) - return nil, nil, syserror.EINVAL + return nil, nil, linuxerr.EINVAL } return ino, f, nil @@ -82,7 +82,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) if mask&linux.ALL_INOTIFY_BITS == 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index c7c3fed57..b806120cd 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Ioctl implements Linux syscall ioctl(2). @@ -28,12 +28,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Handle ioctls that apply to all FDs. @@ -99,7 +99,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go index d1452a04d..008603173 100644 --- a/pkg/sentry/syscalls/linux/vfs2/lock.go +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Flock implements linux syscall flock(2). @@ -30,7 +30,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -57,7 +57,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } default: // flock(2): EINVAL operation is invalid. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go index c4c0f9e0a..70c2cf5a5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/memfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -16,10 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -35,7 +35,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if flags&^memfdAllFlags != 0 { // Unknown bits in flags. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index c961545f6..c804f9fd3 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -16,13 +16,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Mmap implements Linux syscall mmap(2). @@ -38,7 +37,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Require exactly one of MAP_PRIVATE and MAP_SHARED. if private == shared { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } opts := memmap.MMapOpts{ @@ -71,13 +70,13 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Convert the passed FD to a file reference. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // mmap unconditionally requires that the FD is readable. if !file.IsReadable() { - return 0, nil, syserror.EACCES + return 0, nil, linuxerr.EACCES } // MAP_SHARED requires that the FD be writable for PROT_WRITE. if shared && !file.IsWritable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index dd93430e2..4d73d46ef 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -16,12 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Mount implements Linux syscall mount(2). @@ -69,7 +68,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // namespace. creds := t.Credentials() if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | @@ -84,7 +83,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // unknown or unsupported flags are passed. Since we don't implement // everything, we fail explicitly on flags that are unimplemented. if flags&(unsupportedOps|unsupportedFlags) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var opts vfs.MountOptions @@ -125,12 +124,12 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Currently, this is always the init task's user namespace. creds := t.Credentials() if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE if flags&unsupported != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go index 2aaf1ed74..38796d4db 100644 --- a/pkg/sentry/syscalls/linux/vfs2/path.go +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -16,12 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) func copyInPath(t *kernel.Task, addr hostarch.Addr) (fspath.Path, error) { @@ -44,7 +43,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { root.DecRef(t) - return taskPathOperation{}, syserror.ENOENT + return taskPathOperation{}, linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() @@ -53,7 +52,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { root.DecRef(t) - return taskPathOperation{}, syserror.EBADF + return taskPathOperation{}, linuxerr.EBADF } start = dirfile.VirtualDentry() start.IncRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go index c6fc1954c..07a89cf4e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/pipe.go +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -16,14 +16,13 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Pipe implements Linux syscall pipe(2). @@ -41,7 +40,7 @@ func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func pipe2(t *kernel.Task, addr hostarch.Addr, flags int32) error { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } r, w, err := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index a69c80edd..204051cd0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -19,15 +19,15 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" - - "gvisor.dev/gvisor/pkg/hostarch" ) // fileCap is the maximum allowable files for poll & select. This has no @@ -132,7 +132,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // Wait for a notification. timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) if err != nil { - if err == syserror.ETIMEDOUT { + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = nil } return timeout, 0, err @@ -161,7 +161,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. func copyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } pfd := make([]linux.PollFD, nfds) @@ -188,7 +188,7 @@ func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration pfd[i].Events |= linux.POLLHUP | linux.POLLERR } remainingTimeout, n, err := pollBlock(t, pfd, timeout) - err = syserror.ConvertIntr(err, syserror.EINTR) + err = syserr.ConvertIntr(err, linuxerr.EINTR) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. @@ -221,7 +221,7 @@ func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialB func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Calculate the size of the fd sets (one bit per fd). @@ -268,7 +268,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // OK. Linux is racy in the same way. file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } file.DecRef(t) @@ -298,7 +298,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Ad // Do the syscall, then count the number of bits set. if _, _, err = pollBlock(t, pfd, timeout); err != nil { - return 0, syserror.ConvertIntr(err, syserror.EINTR) + return 0, syserr.ConvertIntr(err, linuxerr.EINTR) } // r, w, and e are currently event mask bitsets; unset bits corresponding @@ -410,13 +410,13 @@ func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duration) (uintptr, error) { remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) // On an interrupt poll(2) is restarted with the remaining timeout. - if err == syserror.EINTR { + if linuxerr.Equals(linuxerr.EINTR, err) { t.SetSyscallRestartBlock(&pollRestartBlock{ pfdAddr: pfdAddr, nfds: nfds, timeout: remainingTimeout, }) - return 0, syserror.ERESTART_RESTARTBLOCK + return 0, linuxerr.ERESTART_RESTARTBLOCK } return n, err } @@ -462,8 +462,8 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. - if err == syserror.EINTR && copyErr == nil { - err = syserror.ERESTARTNOHAND + if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -484,7 +484,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } if timeval.Sec < 0 || timeval.Usec < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } timeout = time.Duration(timeval.ToNsecCapped()) } @@ -492,8 +492,8 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. - if err == syserror.EINTR && copyErr == nil { - err = syserror.ERESTARTNOHAND + if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -539,8 +539,8 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // See comment in Ppoll. - if err == syserror.EINTR && copyErr == nil { - err = syserror.ERESTARTNOHAND + if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { + err = linuxerr.ERESTARTNOHAND } return n, nil, err } @@ -561,7 +561,7 @@ func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time. return 0, err } if !timespec.Valid() { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } @@ -573,7 +573,7 @@ func setTempSignalSet(t *kernel.Task, maskAddr hostarch.Addr, maskSize uint) err return nil } if maskSize != linux.SignalSetSize { - return syserror.EINVAL + return linuxerr.EINVAL } var mask linux.SignalSet if _, err := mask.CopyIn(t, maskAddr); err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index b863d7b84..4e7dc5080 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -18,13 +18,13 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -42,14 +42,14 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -62,7 +62,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "read", file) } // Readv implements Linux syscall readv(2). @@ -73,7 +73,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -87,12 +87,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "readv", file) } func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -114,14 +114,14 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt // "would block". n, err = file.Read(t, dst, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -140,19 +140,19 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -165,7 +165,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pread64", file) } // Preadv implements Linux syscall preadv(2). @@ -177,13 +177,13 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -196,7 +196,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv", file) } // Preadv2 implements Linux syscall preadv2(2). @@ -215,13 +215,13 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the destination of the read. @@ -242,12 +242,12 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err = pread(t, file, dst, offset, opts) } t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) } func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -269,14 +269,14 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of // "would block". n, err = file.PRead(t, dst, offset+total, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -293,14 +293,14 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -313,7 +313,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "write", file) } // Writev implements Linux syscall writev(2). @@ -324,7 +324,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -338,12 +338,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "writev", file) } func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -365,14 +365,14 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op // "would block". n, err = file.Write(t, src, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -390,19 +390,19 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. si := int(size) if si < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -415,7 +415,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pwrite64", file) } // Pwritev implements Linux syscall pwritev(2). @@ -427,13 +427,13 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -446,7 +446,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev", file) } // Pwritev2 implements Linux syscall pwritev2(2). @@ -465,13 +465,13 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get the source of the write. @@ -492,12 +492,12 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err = pwrite(t, file, src, offset, opts) } t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) } func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { return n, err } @@ -519,14 +519,14 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o // "would block". n, err = file.PWrite(t, src, offset+total, opts) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { - if err == syserror.ETIMEDOUT { - err = syserror.ErrWouldBlock + if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { + err = linuxerr.ErrWouldBlock } break } @@ -560,7 +560,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -576,27 +576,27 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.IsReadable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Check that the size is valid. if int(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Return EINVAL; if the underlying file type does not support readahead, // then Linux will return EINVAL to indicate as much. In the future, we // may extend this function to actually support readahead hints. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 647e089d0..e608572b4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -16,15 +16,14 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX @@ -65,7 +64,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -105,7 +104,7 @@ func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) @@ -126,7 +125,7 @@ func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vf if owner != -1 { kuid := userns.MapToKUID(auth.UID(owner)) if !kuid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_UID opts.Stat.UID = uint32(kuid) @@ -134,7 +133,7 @@ func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vf if group != -1 { kgid := userns.MapToKGID(auth.GID(group)) if !kgid.Ok() { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_GID opts.Stat.GID = uint32(kgid) @@ -150,7 +149,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -167,7 +166,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc length := args[1].Int64() if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) @@ -191,17 +190,17 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys length := args[1].Int64() if length < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } err := file.SetStat(t, vfs.SetStatOptions{ @@ -222,23 +221,23 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if mode != 0 { - return 0, nil, syserror.ENOTSUP + return 0, nil, linuxerr.ENOTSUP } if offset < 0 || length <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } size := offset + length if size < 0 { - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } limit := limits.FromContext(t).Get(limits.FileSize).Cur if uint64(size) >= limit { @@ -246,7 +245,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) - return 0, nil, syserror.EFBIG + return 0, nil, linuxerr.EFBIG } return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) @@ -340,7 +339,7 @@ func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, op return err } if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime = linux.StatxTimestamp{ @@ -372,7 +371,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // "If filename is NULL and dfd refers to an open file, then operate on the @@ -405,7 +404,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, o } if times[0].Nsec != linux.UTIME_OMIT { if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_ATIME opts.Stat.Atime = linux.StatxTimestamp{ @@ -415,7 +414,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, o } if times[1].Nsec != linux.UTIME_OMIT { if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { - return syserror.EINVAL + return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_MTIME opts.Stat.Mtime = linux.StatxTimestamp{ @@ -432,7 +431,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa start := root if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { - return syserror.ENOENT + return linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() @@ -440,7 +439,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - return syserror.EBADF + return linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.SetStat() instead of @@ -465,10 +464,10 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } func handleSetSizeError(t *kernel.Task, err error) error { - if err == syserror.ErrExceedsFileSizeLimit { + if err == linuxerr.ErrExceedsFileSizeLimit { // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syserror.EFBIG + return linuxerr.EFBIG } return err } diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go index 6163da103..27fb2139b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/signal.go +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -16,13 +16,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd" "gvisor.dev/gvisor/pkg/sentry/kernel" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // sharedSignalfd is shared between the two calls. @@ -35,7 +34,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u // Always check for valid flags, even if not creating. if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Is this a change to an existing signalfd? @@ -44,7 +43,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u if fd != -1 { file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -55,7 +54,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize u } // Not a signalfd. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } fileFlags := uint32(linux.O_RDWR) diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 69f69e3af..48be5a88d 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -18,6 +18,8 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -29,10 +31,7 @@ import ( slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/pkg/hostarch" ) // maxAddrLen is the maximum socket address length we're willing to accept. @@ -117,7 +116,7 @@ type multipleMessageHeader64 struct { // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } addrBuf := make([]byte, addrlen) @@ -139,7 +138,7 @@ func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr h } if int32(bufLen) < 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // Write the length unconditionally. @@ -173,7 +172,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create the new socket. @@ -206,7 +205,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Create the socket pair. @@ -256,14 +255,14 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -273,7 +272,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 - return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(s.Connect(t, a, blocking).ToError(), linuxerr.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept @@ -281,20 +280,20 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } // Call the syscall implementation for this socket, then copy the @@ -304,12 +303,12 @@ func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. - if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL { + if err := writeAddress(t, peer, peerLen, addr, addrLen); linuxerr.Equals(linuxerr.EINVAL, err) { return 0, err } } @@ -346,14 +345,14 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. @@ -373,14 +372,14 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if backlog > maxListenBacklog { @@ -411,21 +410,21 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Validate how, then call syscall implementation. switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() @@ -442,14 +441,14 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Read the length. Reject negative values. @@ -458,7 +457,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Call syscall implementation then copy both value and value len out. @@ -523,21 +522,21 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if optLen < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if optLen > maxOptLen { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyInBytes(optValAddr, buf); err != nil { @@ -561,14 +560,14 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket name and copy it to the caller. @@ -589,14 +588,14 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Get the socket peer name and copy it to the caller. @@ -616,25 +615,25 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -664,7 +663,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -673,20 +672,20 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -701,7 +700,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } if !ts.Valid() { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true @@ -721,7 +720,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { @@ -731,7 +730,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -753,7 +752,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl } if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -766,7 +765,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { - return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(err.ToError(), linuxerr.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC @@ -784,11 +783,11 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl } if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } defer cms.Release(t) @@ -833,25 +832,25 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, fl // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) { if int(bufLen) < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -877,7 +876,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, fla n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) cm.Release(t) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) + return 0, syserr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } // Copy the address to the caller. @@ -911,25 +910,25 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -949,7 +948,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if t.Arch().Width() != 8 { // We only handle 64-bit for now. - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { @@ -959,19 +958,19 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, nil, syserror.ENOTSOCK + return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -983,7 +982,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { @@ -993,7 +992,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { - return 0, nil, syserror.EFAULT + return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break @@ -1018,7 +1017,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { - return 0, syserror.ENOBUFS + return 0, linuxerr.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil { @@ -1038,7 +1037,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { - return 0, syserror.EMSGSIZE + return 0, linuxerr.EMSGSIZE } src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, @@ -1063,7 +1062,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) - err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) + err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendmsg", file) // Control messages should be released on error as well as for zero-length // messages, which are discarded by the receiver. if n == 0 || err != nil { @@ -1077,20 +1076,20 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { - return 0, syserror.ENOTSOCK + return 0, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { @@ -1125,7 +1124,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) - return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file) + return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 19e175203..0205f09e0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -18,6 +18,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -25,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -46,29 +46,29 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal count = int64(kernel.MAX_RW_COUNT) } if count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFileVFS2(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // The operation is non-blocking if anything is non-blocking. @@ -82,38 +82,38 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe && !outIsPipe { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy in offsets. inOffset := int64(-1) if inOffsetPtr != 0 { if inIsPipe { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if inFile.Options().DenyPRead { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil { return 0, nil, err } if inOffset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } outOffset := int64(-1) if outOffsetPtr != 0 { if outIsPipe { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } if outFile.Options().DenyPWrite { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil { return 0, nil, err } if outOffset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } @@ -150,7 +150,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal panic("at least one end of splice must be a pipe") } - if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + if n != 0 || err != linuxerr.ErrWouldBlock || nonBlock { break } if err = dw.waitForBoth(t); err != nil { @@ -172,7 +172,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "splice", outFile) } // Tee implements Linux syscall tee(2). @@ -189,29 +189,29 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo count = int64(kernel.MAX_RW_COUNT) } if count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFileVFS2(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // The operation is non-blocking if anything is non-blocking. @@ -225,7 +225,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe || !outIsPipe { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy data. @@ -240,7 +240,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo defer dw.destroy() for { n, err = pipe.Tee(t, outPipeFD, inPipeFD, count) - if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + if n != 0 || err != linuxerr.ErrWouldBlock || nonBlock { break } if err = dw.waitForBoth(t); err != nil { @@ -250,7 +250,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if n != 0 { // If a partial write is completed, the error is dropped. Log it here. - if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + if err != nil && err != io.EOF && err != linuxerr.ErrWouldBlock { log.Debugf("tee completed a partial write with error: %v", err) err = nil } @@ -258,7 +258,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, linuxerr.ERESTARTSYS, "tee", inFile) } // Sendfile implements linux system call sendfile(2). @@ -270,25 +270,25 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc inFile := t.GetFileVFS2(inFD) if inFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) if !inFile.IsReadable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } outFile := t.GetFileVFS2(outFD) if outFile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) if !outFile.IsWritable() { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } // Verify that the outFile Append flag is not set. if outFile.StatusFlags()&linux.O_APPEND != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Verify that inFile is a regular file or block device. This is a @@ -298,14 +298,14 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } else if stat.Mask&linux.STATX_TYPE == 0 || (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Copy offset if it exists. offset := int64(-1) if offsetAddr != 0 { if inFile.Options().DenyPRead { - return 0, nil, syserror.ESPIPE + return 0, nil, linuxerr.ESPIPE } var offsetP primitive.Int64 if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { @@ -314,16 +314,16 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc offset = int64(offsetP) if offset < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if offset+count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } } // Validate count. This must come after offset checks. if count < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if count == 0 { return 0, nil, nil @@ -359,10 +359,10 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc break } if err == nil && t.Interrupted() { - err = syserror.ErrInterrupted + err = linuxerr.ErrInterrupted break } - if err == syserror.ErrWouldBlock && !nonBlock { + if err == linuxerr.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } if err != nil { @@ -388,7 +388,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc var writeN int64 writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) wbuf = wbuf[writeN:] - if err == syserror.ErrWouldBlock && !nonBlock { + if err == linuxerr.ErrWouldBlock && !nonBlock { err = dw.waitForOut(t) } if err != nil { @@ -419,10 +419,10 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc break } if err == nil && t.Interrupted() { - err = syserror.ErrInterrupted + err = linuxerr.ErrInterrupted break } - if err == syserror.ErrWouldBlock && !nonBlock { + if err == linuxerr.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } if err != nil { @@ -440,7 +440,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } if total != 0 { - if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + if err != nil && err != io.EOF && err != linuxerr.ErrWouldBlock { // If a partial write is completed, the error is dropped. Log it here. log.Debugf("sendfile completed a partial write with error: %v", err) err = nil @@ -449,7 +449,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. - return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) + return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, linuxerr.ERESTARTSYS, "sendfile", inFile) } // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index 69e77fa99..adaf8db3f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -17,15 +17,14 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // Stat implements Linux syscall stat(2). @@ -53,7 +52,7 @@ func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } opts := vfs.StatOptions{ @@ -70,7 +69,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flag start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() @@ -78,7 +77,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flag } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - return syserror.EBADF + return linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of @@ -131,7 +130,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -156,15 +155,15 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall statxAddr := args[4].Pointer() if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Make sure that only one sync type option is set. syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE) if syncType != 0 && !bits.IsPowerOfTwo32(syncType) { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if mask&linux.STATX__RESERVED != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } opts := vfs.StatOptions{ @@ -182,7 +181,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { - return 0, nil, syserror.ENOENT + return 0, nil, linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() @@ -190,7 +189,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of @@ -272,7 +271,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) er // Sanity check the mode. if mode&^(rOK|wOK|xOK) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) @@ -315,7 +314,7 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { if int(size) <= 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go index 2da538fc6..122921b52 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go index 88b9c7627..d32031481 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 1f8a5878c..cfc693422 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -16,9 +16,10 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/syserr" ) // Sync implements Linux syscall sync(2). @@ -32,12 +33,12 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } return 0, nil, file.SyncFS(t) @@ -49,7 +50,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -71,15 +72,15 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel // Check for negative values and overflow. if offset < 0 || offset+nbytes < 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -107,12 +108,12 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS } if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { if err := file.Sync(t); err != nil { - return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + return 0, nil, syserr.ConvertIntr(err, linuxerr.ERESTARTSYS) } } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 250870c03..b8f96a757 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -16,11 +16,11 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" ) // TimerfdCreate implements Linux syscall timerfd_create(2). @@ -29,7 +29,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel flags := args[1].Int() if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } // Timerfds aren't writable per se (their implementation of Write just @@ -47,7 +47,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: clock = t.Kernel().MonotonicClock() default: - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } vfsObj := t.Kernel().VFS() file, err := timerfd.New(t, vfsObj, clock, fileFlags) @@ -72,18 +72,18 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne oldValAddr := args[3].Pointer() if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } var newVal linux.Itimerspec @@ -111,13 +111,13 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } tm, s := tfd.GetTime() diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index c261050c6..7b2f69c45 100644 --- a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -18,13 +18,12 @@ import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - - "gvisor.dev/gvisor/pkg/hostarch" ) // ListXattr implements Linux syscall listxattr(2). @@ -71,7 +70,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -140,7 +139,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -178,7 +177,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) @@ -216,12 +215,12 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { - return 0, nil, syserror.EINVAL + return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -280,7 +279,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. file := t.GetFileVFS2(fd) if file == nil { - return 0, nil, syserror.EBADF + return 0, nil, linuxerr.EBADF } defer file.DecRef(t) @@ -295,13 +294,13 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. func copyInXattrName(t *kernel.Task, nameAddr hostarch.Addr) (string, error) { name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) if err != nil { - if err == syserror.ENAMETOOLONG { - return "", syserror.ERANGE + if linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { + return "", linuxerr.ERANGE } return "", err } if len(name) == 0 { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return name, nil } @@ -321,16 +320,16 @@ func copyOutXattrNameList(t *kernel.Task, listAddr hostarch.Addr, size uint, nam } if buf.Len() > int(size) { if size >= linux.XATTR_LIST_MAX { - return 0, syserror.E2BIG + return 0, linuxerr.E2BIG } - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return t.CopyOutBytes(listAddr, buf.Bytes()) } func copyInXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint) (string, error) { if size > linux.XATTR_SIZE_MAX { - return "", syserror.E2BIG + return "", linuxerr.E2BIG } buf := make([]byte, size) if _, err := t.CopyInBytes(valueAddr, buf); err != nil { @@ -349,9 +348,9 @@ func copyOutXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint, value } if len(value) > int(size) { if size >= linux.XATTR_SIZE_MAX { - return 0, syserror.E2BIG + return 0, linuxerr.E2BIG } - return 0, syserror.ERANGE + return 0, linuxerr.ERANGE } return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index f88055676..cfcc21271 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -28,9 +28,9 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/syserror" ) // Supported returns a syscall that is fully supported. @@ -99,13 +99,13 @@ func CapError(name string, c linux.Capability, note string, urls []string) kerne Name: name, Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if !t.HasCapability(c) { - return 0, nil, syserror.EPERM + return 0, nil, linuxerr.EPERM } t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS + return 0, nil, linuxerr.ENOSYS }, SupportLevel: kernel.SupportUnimplemented, - Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, syserror.EPERM, c.String(), syserror.ENOSYS), + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, linuxerr.EPERM, c.String(), linuxerr.ENOSYS), URLs: urls, } } diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 362dea76d..c21971322 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -25,6 +25,8 @@ go_library( "muldiv_arm64.s", "parameters.go", "sampler.go", + "sampler_amd64.go", + "sampler_arm64.go", "sampler_unsafe.go", "seqatomic_parameters_unsafe.go", "tsc_amd64.s", @@ -32,11 +34,11 @@ go_library( ], visibility = ["//:sandbox"], deps = [ + "//pkg/errors/linuxerr", "//pkg/gohacks", "//pkg/log", "//pkg/metric", "//pkg/sync", - "//pkg/syserror", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index 39bf1e0de..eed74f6bd 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -19,10 +19,10 @@ package time import ( "time" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // CalibratedClock implements a clock that tracks a reference clock. @@ -259,6 +259,6 @@ func (c *CalibratedClocks) GetTime(id ClockID) (int64, error) { case Realtime: return c.realtime.GetTime() default: - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } } diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go index d6622bfe2..0a4b1f1bf 100644 --- a/pkg/sentry/time/calibrated_clock_test.go +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -50,6 +50,7 @@ func TestConstantFrequency(t *testing.T) { if !c.ready { c.mu.RUnlock() t.Fatalf("clock not ready") + return // For checklocks consistency. } // A bit after the last sample. now, ok := c.params.ComputeTime(750000) diff --git a/pkg/sentry/time/sampler.go b/pkg/sentry/time/sampler.go index 4ac9c4474..24a47f5d5 100644 --- a/pkg/sentry/time/sampler.go +++ b/pkg/sentry/time/sampler.go @@ -21,13 +21,6 @@ import ( ) const ( - // defaultOverheadTSC is the default estimated syscall overhead in TSC cycles. - // It is further refined as syscalls are made. - defaultOverheadCycles = 1 * 1000 - - // maxOverheadCycles is the maximum allowed syscall overhead in TSC cycles. - maxOverheadCycles = 100 * defaultOverheadCycles - // maxSampleLoops is the maximum number of times to try to get a clock sample // under the expected overhead. maxSampleLoops = 5 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/time/sampler_amd64.go index b734b6987..5fa1832b4 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go +++ b/pkg/sentry/time/sampler_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,19 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -package disklayout +//go:build amd64 +// +build amd64 -import ( - "testing" -) +package time + +const ( + // defaultOverheadTSC is the default estimated syscall overhead in TSC cycles. + // It is further refined as syscalls are made. + defaultOverheadCycles = 1 * 1000 -// TestSuperBlockSize tests that the superblock structs are of the correct -// size. -func TestSuperBlockSize(t *testing.T) { - var sbOld SuperBlockOld - assertSize(t, &sbOld, 84) - var sb32 SuperBlock32Bit - assertSize(t, &sb32, 336) - var sb64 SuperBlock64Bit - assertSize(t, &sb64, 1024) -} + // maxOverheadCycles is the maximum allowed syscall overhead in TSC cycles. + maxOverheadCycles = 100 * defaultOverheadCycles +) diff --git a/pkg/sentry/time/sampler_arm64.go b/pkg/sentry/time/sampler_arm64.go new file mode 100644 index 000000000..3560e66ae --- /dev/null +++ b/pkg/sentry/time/sampler_arm64.go @@ -0,0 +1,43 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build arm64 +// +build arm64 + +package time + +// getCNTFRQ get ARM counter-timer frequency +func getCNTFRQ() TSCValue + +// getDefaultArchOverheadCycles get default OverheadCycles based on +// ARM counter-timer frequency. Usually ARM counter-timer frequency +// is range from 1-50Mhz which is much less than that on x86, so we +// calibrate defaultOverheadCycles for ARM. +func getDefaultArchOverheadCycles() TSCValue { + // estimated the clock frequency on x86 is 1Ghz. + // 1Ghz devided by counter-timer frequency of ARM to get + // frqRatio. defaultOverheadCycles of ARM equals to that on + // x86 devided by frqRatio + cntfrq := getCNTFRQ() + frqRatio := 1000000000 / cntfrq + overheadCycles := (1 * 1000) / frqRatio + return overheadCycles +} + +// defaultOverheadTSC is the default estimated syscall overhead in TSC cycles. +// It is further refined as syscalls are made. +var defaultOverheadCycles = getDefaultArchOverheadCycles() + +// maxOverheadCycles is the maximum allowed syscall overhead in TSC cycles. +var maxOverheadCycles = 100 * defaultOverheadCycles diff --git a/pkg/sentry/time/tsc_arm64.s b/pkg/sentry/time/tsc_arm64.s index da9fa4112..711349fa1 100644 --- a/pkg/sentry/time/tsc_arm64.s +++ b/pkg/sentry/time/tsc_arm64.s @@ -20,3 +20,9 @@ TEXT ·Rdtsc(SB),NOSPLIT,$0-8 WORD $0xd53be040 //MRS CNTVCT_EL0, R0 MOVD R0, ret+0(FP) RET + +TEXT ·getCNTFRQ(SB),NOSPLIT,$0-8 + // Get the virtual counter frequency. + WORD $0xd53be000 //MRS CNTFRQ_EL0, R0 + MOVD R0, ret+0(FP) + RET diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 581862ee2..e7073ec87 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -132,7 +132,7 @@ func Init() error { // always be the case for a newly mapped page from /dev/shm. If we obtain // the shared memory through some other means in the future, we may have to // explicitly zero the page. - mmap, err := unix.Mmap(int(file.Fd()), 0, int(RTMemoryStatsSize), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED) + mmap, err := memutil.MapFile(0, RTMemoryStatsSize, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, file.Fd(), 0) if err != nil { return fmt.Errorf("error mapping usage file: %v", err) } diff --git a/pkg/sentry/usage/memory_unsafe.go b/pkg/sentry/usage/memory_unsafe.go index 9e0014ca0..bc1531b91 100644 --- a/pkg/sentry/usage/memory_unsafe.go +++ b/pkg/sentry/usage/memory_unsafe.go @@ -21,7 +21,7 @@ import ( // RTMemoryStatsSize is the size of the RTMemoryStats struct. var RTMemoryStatsSize = unsafe.Sizeof(RTMemoryStats{}) -// RTMemoryStatsPointer casts the address of the byte slice into a RTMemoryStats pointer. -func RTMemoryStatsPointer(b []byte) *RTMemoryStats { - return (*RTMemoryStats)(unsafe.Pointer(&b[0])) +// RTMemoryStatsPointer casts addr to a RTMemoryStats pointer. +func RTMemoryStatsPointer(addr uintptr) *RTMemoryStats { + return (*RTMemoryStats)(unsafe.Pointer(addr)) } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index ac60fe8bf..914574543 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -95,6 +95,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fd", "//pkg/fdnotifier", "//pkg/fspath", @@ -115,7 +116,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -133,9 +133,9 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/contexttest", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 5aad31b78..82ee2c521 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -1,9 +1,5 @@ # The gVisor Virtual Filesystem -THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION -USE. For the filesystem implementation currently used by gVisor, see the `fs` -package. - ## Implementation Notes ### Reference Counting diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index f48817132..255d3992e 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -19,11 +19,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/syserror" ) // NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name, @@ -101,7 +101,7 @@ func (fs *anonFilesystem) Sync(ctx context.Context) error { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID) } @@ -109,10 +109,10 @@ func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds // GetDentryAt implements FilesystemImpl.GetDentryAt. func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { if !rp.Done() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if opts.CheckSearchable { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil @@ -121,7 +121,7 @@ func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, op // GetParentDentryAt implements FilesystemImpl.GetParentDentryAt. func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) { if !rp.Final() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil @@ -130,63 +130,63 @@ func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPa // LinkAt implements FilesystemImpl.LinkAt. func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // MkdirAt implements FilesystemImpl.MkdirAt. func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // MknodAt implements FilesystemImpl.MknodAt. func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // OpenAt implements FilesystemImpl.OpenAt. func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) { if !rp.Done() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } // ReadlinkAt implements FilesystemImpl.ReadlinkAt. func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) { if !rp.Done() { - return "", syserror.ENOTDIR + return "", linuxerr.ENOTDIR } - return "", syserror.EINVAL + return "", linuxerr.EINVAL } // RenameAt implements FilesystemImpl.RenameAt. func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // RmdirAt implements FilesystemImpl.RmdirAt. func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // SetStatAt implements FilesystemImpl.SetStatAt. func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Linux actually permits anon_inode_inode's metadata to be set, which is // visible to all users of anon_inode_inode. We just silently ignore @@ -197,7 +197,7 @@ func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts // StatAt implements FilesystemImpl.StatAt. func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) { if !rp.Done() { - return linux.Statx{}, syserror.ENOTDIR + return linux.Statx{}, linuxerr.ENOTDIR } // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode(). return linux.Statx{ @@ -218,7 +218,7 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St // StatFSAt implements FilesystemImpl.StatFSAt. func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) { if !rp.Done() { - return linux.Statfs{}, syserror.ENOTDIR + return linux.Statfs{}, linuxerr.ENOTDIR } return linux.Statfs{ Type: linux.ANON_INODE_FS_MAGIC, @@ -229,34 +229,34 @@ func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linu // SymlinkAt implements FilesystemImpl.SymlinkAt. func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // UnlinkAt implements FilesystemImpl.UnlinkAt. func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) { if !rp.Final() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil { return nil, err } - return nil, syserror.ECONNREFUSED + return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements FilesystemImpl.ListXattrAt. func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() { - return nil, syserror.ENOTDIR + return nil, linuxerr.ENOTDIR } return nil, nil } @@ -264,25 +264,25 @@ func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, si // GetXattrAt implements FilesystemImpl.GetXattrAt. func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) { if !rp.Done() { - return "", syserror.ENOTDIR + return "", linuxerr.ENOTDIR } - return "", syserror.ENOTSUP + return "", linuxerr.ENOTSUP } // SetXattrAt implements FilesystemImpl.SetXattrAt. func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // RemoveXattrAt implements FilesystemImpl.RemoveXattrAt. func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error { if !rp.Done() { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } - return syserror.EPERM + return linuxerr.EPERM } // PrependPath implements FilesystemImpl.PrependPath. diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index e7ca24d96..cb92b6eee 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -18,8 +18,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Dentry represents a node in a Filesystem tree at which a file exists. @@ -196,11 +196,12 @@ func (d *Dentry) OnZeroWatches(ctx context.Context) { // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. +// +checklocksacquire:d.mu func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[d] != 0 { vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY // +checklocksforce: inconsistent return. } d.mu.Lock() vfs.mountMu.Unlock() @@ -211,14 +212,14 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -// +checklocks:d.mu +// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() @@ -249,16 +250,18 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { // Preconditions: // * If to is not nil, it must be a child Dentry from the same Filesystem. // * from != to. +// +checklocksacquire:from.mu +// +checklocksacquire:to.mu func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } if to != nil { if mntns.mountpoints[to] != 0 { vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY // +checklocksforce: no locks acquired. } to.mu.Lock() } @@ -267,13 +270,13 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. - return nil + return nil // +checklocksforce: to may not be acquired. } // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -286,8 +289,8 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { @@ -303,8 +306,8 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, fro // from and to are exchanged by rename(RENAME_EXCHANGE). // // Preconditions: PrepareRenameDentry was previously called on from and to. -// +checklocks:from.mu -// +checklocks:to.mu +// +checklocksrelease:from.mu +// +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.mu.Unlock() to.mu.Unlock() diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index dde2ad79b..572d81afc 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -18,7 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // DeviceKind indicates whether a device is a block or character device. @@ -100,7 +100,7 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo defer vfs.devicesMu.RUnlock() rd, ok := vfs.devices[tup] if !ok { - return nil, syserror.ENXIO + return nil, linuxerr.ENXIO } return rd.dev.Open(ctx, mnt, d, *opts) } @@ -120,7 +120,7 @@ func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) { } minor++ } - return 0, syserror.EMFILE + return 0, linuxerr.EMFILE } // PutAnonBlockDevMinor deallocates a minor device number returned by a diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index ae004b371..04bc4d10c 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -17,8 +17,8 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -174,7 +174,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin // that cyclic polling is not introduced after the check. defer epollCycleMu.Unlock() if subep.mightPoll(ep) { - return syserror.ELOOP + return linuxerr.ELOOP } } @@ -187,7 +187,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin num: num, } if _, ok := ep.interest[key]; ok { - return syserror.EEXIST + return linuxerr.EEXIST } // Register interest in file. @@ -258,7 +258,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event num: num, }] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } // Update epi for the next call to ep.ReadEvents(). @@ -294,7 +294,7 @@ func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error num: num, }] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } // Unregister from the file so that epi will no longer be readied. diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index ef8d8a813..ca3303dec 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -20,13 +20,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -252,7 +252,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede return err } if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { - return syserror.EPERM + return linuxerr.EPERM } } if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { @@ -266,14 +266,14 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede return err } if stat.Mask&linux.STATX_UID == 0 { - return syserror.EPERM + return linuxerr.EPERM } if !CanActAsOwner(creds, auth.KUID(stat.UID)) { - return syserror.EPERM + return linuxerr.EPERM } } if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { - return syserror.EINVAL + return linuxerr.EINVAL } // TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()? const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK @@ -567,7 +567,7 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { // Allocate grows file represented by FileDescription to offset + length bytes. func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { if !fd.IsWritable() { - return syserror.EBADF + return linuxerr.EBADF } if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil { return err @@ -602,10 +602,10 @@ func (fd *FileDescription) EventUnregister(e *waiter.Entry) { // partial reads with a nil error. func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { if fd.opts.DenyPRead { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } if !fd.readable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.PRead(ctx, dst, offset, opts) @@ -620,7 +620,7 @@ func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of // Read is similar to PRead, but does not specify an offset. func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if !fd.readable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.Read(ctx, dst, opts) @@ -637,10 +637,10 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt // return partial writes with a nil error. func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if fd.opts.DenyPWrite { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } if !fd.writable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } n, err := fd.impl.PWrite(ctx, src, offset, opts) if n > 0 { @@ -652,7 +652,7 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { if !fd.writable { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } n, err := fd.impl.Write(ctx, src, opts) if n > 0 { @@ -708,8 +708,8 @@ func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string return names, err } names, err := fd.impl.ListXattr(ctx, size) - if err == syserror.ENOTSUP { - // Linux doesn't actually return ENOTSUP in this case; instead, + if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { + // Linux doesn't actually return EOPNOTSUPP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by default // don't exist. @@ -873,7 +873,7 @@ func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, l } off = int64(stat.Size) default: - return lock.LockRange{}, syserror.EINVAL + return lock.LockRange{}, linuxerr.EINVAL } return lock.ComputeRange(int64(start), int64(length), off) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 2b6f47b4b..452f5f1f9 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -17,14 +17,15 @@ package vfs import ( "bytes" "io" + "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -55,7 +56,7 @@ func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error { // StatFS implements FileDescriptionImpl.StatFS analogously to // super_operations::statfs == NULL in Linux. func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { - return linux.Statfs{}, syserror.ENOSYS + return linux.Statfs{}, linuxerr.ENOSYS } // Allocate implements FileDescriptionImpl.Allocate analogously to @@ -65,7 +66,7 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err // should technically return EISDIR. Allocate should never be called for a // directory, because it requires a writable fd. func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.ENODEV + return linuxerr.ENODEV } // Readiness implements waiter.Waitable.Readiness analogously to @@ -88,81 +89,81 @@ func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { // PRead implements FileDescriptionImpl.PRead analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Read implements FileDescriptionImpl.Read analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // PWrite implements FileDescriptionImpl.PWrite analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Write implements FileDescriptionImpl.Write analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // IterDirents implements FileDescriptionImpl.IterDirents analogously to // file_operations::iterate == file_operations::iterate_shared == NULL in // Linux. func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Seek implements FileDescriptionImpl.Seek analogously to // file_operations::llseek == NULL in Linux. func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Sync implements FileDescriptionImpl.Sync analogously to // file_operations::fsync == NULL in Linux. func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { - return syserror.EINVAL + return linuxerr.EINVAL } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to // file_operations::mmap == NULL in Linux. func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - return syserror.ENODEV + return linuxerr.ENODEV } // Ioctl implements FileDescriptionImpl.Ioctl analogously to // file_operations::unlocked_ioctl == NULL in Linux. func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } // ListXattr implements FileDescriptionImpl.ListXattr analogously to // inode_operations::listxattr == NULL in Linux. func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) { // This isn't exactly accurate; see FileDescription.ListXattr. - return nil, syserror.ENOTSUP + return nil, linuxerr.ENOTSUP } // GetXattr implements FileDescriptionImpl.GetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { - return "", syserror.ENOTSUP + return "", linuxerr.ENOTSUP } // SetXattr implements FileDescriptionImpl.SetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error { - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error { - return syserror.ENOTSUP + return linuxerr.ENOTSUP } // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of @@ -174,27 +175,27 @@ type DirectoryFileDescriptionDefaultImpl struct{} // Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EISDIR + return linuxerr.EISDIR } // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Read implements FileDescriptionImpl.Read. func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // PWrite implements FileDescriptionImpl.PWrite. func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements FileDescriptionImpl.Write. func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // DentryMetadataFileDescriptionImpl may be embedded by implementations of @@ -333,10 +334,10 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6 offset += fd.off default: // fs/seq_file:seq_lseek() rejects SEEK_END etc. - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } if offset != fd.lastRead { // Regenerate the file's contents immediately. Compare @@ -357,7 +358,7 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6 // Preconditions: fd.mu must be locked. func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, syserror.EOPNOTSUPP + return 0, linuxerr.EOPNOTSUPP } limit, err := CheckLimit(ctx, offset, src.NumBytes()) if err != nil { @@ -367,7 +368,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { - return 0, syserror.EIO + return 0, linuxerr.EIO } n, err := writable.Write(ctx, src, offset) if err != nil { @@ -399,6 +400,9 @@ func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src userme // GenericConfigureMMap may be used by most implementations of // FileDescriptionImpl.ConfigureMMap. func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { + if opts.Offset+opts.Length > math.MaxInt64 { + return linuxerr.EOVERFLOW + } opts.Mappable = m opts.MappingIdentity = fd fd.IncRef() @@ -467,27 +471,27 @@ func (NoLockFD) SupportsLocks() bool { // LockBSD implements FileDescriptionImpl.LockBSD. func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { - return syserror.ENOLCK + return linuxerr.ENOLCK } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { - return linux.Flock{}, syserror.ENOLCK + return linux.Flock{}, linuxerr.ENOLCK } // BadLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface @@ -503,25 +507,25 @@ func (BadLockFD) SupportsLocks() bool { // LockBSD implements FileDescriptionImpl.LockBSD. func (BadLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { - return syserror.EBADF + return linuxerr.EBADF } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (BadLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { - return syserror.EBADF + return linuxerr.EBADF } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (BadLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { - return syserror.EBADF + return linuxerr.EBADF } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (BadLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { - return syserror.EBADF + return linuxerr.EBADF } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (BadLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { - return linux.Flock{}, syserror.EBADF + return linux.Flock{}, linuxerr.EBADF } diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 1cd607c0a..e34a8c11b 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -102,7 +102,7 @@ func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, erro // SetStat implements FileDescriptionImpl.SetStat. func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { - return syserror.EPERM + return linuxerr.EPERM } func TestGenCountFD(t *testing.T) { @@ -155,11 +155,11 @@ func TestGenCountFD(t *testing.T) { } // Write and PWrite fails. - if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) + if _, err := fd.Write(ctx, ioseq, WriteOptions{}); !linuxerr.Equals(linuxerr.EIO, err) { + t.Errorf("Write: got err %v, wanted %v", err, linuxerr.EIO) } - if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) + if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); !linuxerr.Equals(linuxerr.EIO, err) { + t.Errorf("Write: got err %v, wanted %v", err, linuxerr.EIO) } } @@ -215,10 +215,10 @@ func TestWritable(t *testing.T) { if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil { t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err) } - if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL { + if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && !linuxerr.Equals(linuxerr.EINVAL, err) { t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err) } - if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL { + if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && !linuxerr.Equals(linuxerr.EINVAL, err) { t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err) } } diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 49d29e20b..17d94b341 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -98,7 +98,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. flags &^= linux.O_CLOEXEC if flags&^linux.O_NONBLOCK != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } id := uniqueid.GlobalFromContext(ctx) @@ -184,23 +184,23 @@ func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { // PRead implements FileDescriptionImpl.PRead. func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // PWrite implements FileDescriptionImpl.PWrite. func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.ESPIPE + return 0, linuxerr.ESPIPE } // Write implements FileDescriptionImpl.Write. func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if dst.NumBytes() < inotifyEventBaseSize { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } i.evMu.Lock() @@ -208,7 +208,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt if i.events.Empty() { // Nothing to read yet, tell caller to block. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } var writeLen int64 @@ -226,7 +226,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt // write some events out. return writeLen, nil } - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } // Linux always dequeues an available event as long as there's enough @@ -262,7 +262,7 @@ func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallAr return 0, err default: - return 0, syserror.ENOTTY + return 0, linuxerr.ENOTTY } } @@ -332,7 +332,7 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { if ws == nil { // While Linux supports inotify watches on all filesystem types, watches on // filesystems like kernfs are not generally useful, so we do not. - return 0, syserror.EPERM + return 0, linuxerr.EPERM } // Does the target already have a watch from this inotify instance? if existing := ws.Lookup(i.id); existing != nil { @@ -360,7 +360,7 @@ func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { w, ok := i.watches[wd] if !ok { i.mu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } // Remove the watch from this instance. diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go index cbe4d8c2d..1853cdca0 100644 --- a/pkg/sentry/vfs/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -17,8 +17,8 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/syserror" ) // FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) @@ -47,9 +47,9 @@ func (fl *FileLocks) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerID i // Return an appropriate error for the unsuccessful lock attempt, depending on // whether this is a blocking or non-blocking operation. if block == nil { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } // UnlockBSD releases a BSD-style lock on the entire file. @@ -69,9 +69,9 @@ func (fl *FileLocks) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPI // Return an appropriate error for the unsuccessful lock attempt, depending on // whether this is a blocking or non-blocking operation. if block == nil { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } // UnlockPOSIX releases a POSIX-style lock on a file region. diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD index ea82f4987..444ab42b9 100644 --- a/pkg/sentry/vfs/memxattr/BUILD +++ b/pkg/sentry/vfs/memxattr/BUILD @@ -8,9 +8,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/errors/linuxerr", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go index 9b7953fa3..f0f82a4d6 100644 --- a/pkg/sentry/vfs/memxattr/xattr.go +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -20,10 +20,10 @@ import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // SimpleExtendedAttributes implements extended attributes using a map of @@ -49,12 +49,12 @@ func (x *SimpleExtendedAttributes) GetXattr(creds *auth.Credentials, mode linux. value, ok := x.xattrs[opts.Name] x.mu.RUnlock() if !ok { - return "", syserror.ENODATA + return "", linuxerr.ENODATA } // Check that the size of the buffer provided in getxattr(2) is large enough // to contain the value. if opts.Size != 0 && uint64(len(value)) > opts.Size { - return "", syserror.ERANGE + return "", linuxerr.ERANGE } return value, nil } @@ -69,17 +69,17 @@ func (x *SimpleExtendedAttributes) SetXattr(creds *auth.Credentials, mode linux. defer x.mu.Unlock() if x.xattrs == nil { if opts.Flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } x.xattrs = make(map[string]string) } _, ok := x.xattrs[opts.Name] if ok && opts.Flags&linux.XATTR_CREATE != 0 { - return syserror.EEXIST + return linuxerr.EEXIST } if !ok && opts.Flags&linux.XATTR_REPLACE != 0 { - return syserror.ENODATA + return linuxerr.ENODATA } x.xattrs[opts.Name] = opts.Value @@ -106,7 +106,7 @@ func (x *SimpleExtendedAttributes) ListXattr(creds *auth.Credentials, size uint6 } x.mu.RUnlock() if size != 0 && uint64(listSize) > size { - return nil, syserror.ERANGE + return nil, linuxerr.ERANGE } return names, nil } @@ -120,7 +120,7 @@ func (x *SimpleExtendedAttributes) RemoveXattr(creds *auth.Credentials, mode lin x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { - return syserror.ENODATA + return linuxerr.ENODATA } delete(x.xattrs, name) return nil diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index f93da3af1..05a416775 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -24,9 +24,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem @@ -159,7 +159,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth rft := vfs.getFilesystemType(fsTypeName) if rft == nil { ctx.Warningf("Unknown filesystem type: %s", fsTypeName) - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { @@ -192,10 +192,10 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } if !opts.InternalMount && !rft.opts.AllowUserMount { - return nil, syserror.ENODEV + return nil, linuxerr.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { @@ -224,7 +224,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr vdDentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef(ctx) - return syserror.ENOENT + return linuxerr.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and // vfs.mountMu.Lock(). @@ -284,7 +284,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia // UmountAt removes the Mount at the given path. func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { - return syserror.EINVAL + return linuxerr.EINVAL } // MNT_FORCE is currently unimplemented except for the permission check. @@ -292,7 +292,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti // namespace, and not in the owner user namespace for the target mount. See // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { - return syserror.EPERM + return linuxerr.EPERM } vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) @@ -301,19 +301,19 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } defer vd.DecRef(ctx) if vd.dentry != vd.mount.root { - return syserror.EINVAL + return linuxerr.EINVAL } vfs.mountMu.Lock() if mntns := MountNamespaceFromContext(ctx); mntns != nil { defer mntns.DecRef(ctx) if mntns != vd.mount.ns { vfs.mountMu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } if vd.mount == vd.mount.ns.root { vfs.mountMu.Unlock() - return syserror.EINVAL + return linuxerr.EINVAL } } @@ -326,7 +326,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if len(vd.mount.children) != 0 { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY } // We are holding a reference on vd.mount. expectedRefs := int64(1) @@ -336,7 +336,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() - return syserror.EBUSY + return linuxerr.EBUSY } } vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{ @@ -710,7 +710,7 @@ func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error { func (mnt *Mount) CheckBeginWrite() error { if atomic.AddInt64(&mnt.writers, 1) < 0 { atomic.AddInt64(&mnt.writers, -1) - return syserror.EROFS + return linuxerr.EROFS } return nil } @@ -728,7 +728,7 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error { } if ro { if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { - return syserror.EBUSY + return linuxerr.EBUSY } return nil } diff --git a/pkg/sentry/vfs/opath.go b/pkg/sentry/vfs/opath.go index e9651b631..da0b33b79 100644 --- a/pkg/sentry/vfs/opath.go +++ b/pkg/sentry/vfs/opath.go @@ -17,10 +17,10 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -40,77 +40,77 @@ func (fd *opathFD) Release(context.Context) { // Allocate implements FileDescriptionImpl.Allocate. func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EBADF + return linuxerr.EBADF } // PRead implements FileDescriptionImpl.PRead. func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // PWrite implements FileDescriptionImpl.PWrite. func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Write implements FileDescriptionImpl.Write. func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // Ioctl implements FileDescriptionImpl.Ioctl. func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // IterDirents implements FileDescriptionImpl.IterDirents. func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error { - return syserror.EBADF + return linuxerr.EBADF } // Seek implements FileDescriptionImpl.Seek. func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.EBADF + return 0, linuxerr.EBADF } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap. func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - return syserror.EBADF + return linuxerr.EBADF } // ListXattr implements FileDescriptionImpl.ListXattr. func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { - return nil, syserror.EBADF + return nil, linuxerr.EBADF } // GetXattr implements FileDescriptionImpl.GetXattr. func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { - return "", syserror.EBADF + return "", linuxerr.EBADF } // SetXattr implements FileDescriptionImpl.SetXattr. func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error { - return syserror.EBADF + return linuxerr.EBADF } // RemoveXattr implements FileDescriptionImpl.RemoveXattr. func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error { - return syserror.EBADF + return linuxerr.EBADF } // Sync implements FileDescriptionImpl.Sync. func (fd *opathFD) Sync(ctx context.Context) error { - return syserror.EBADF + return linuxerr.EBADF } // SetStat implements FileDescriptionImpl.SetStat. func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error { - return syserror.EBADF + return linuxerr.EBADF } // Stat implements FileDescriptionImpl.Stat. diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index e4da15009..7cc68a157 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -16,9 +16,9 @@ package vfs import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) var fspathBuilderPool = sync.Pool{ @@ -137,7 +137,7 @@ loop: // Linux's sys_getcwd(). func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { if vd.dentry.IsDead() { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } b := getFSPathBuilder() diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index b7704874f..953d31876 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -20,9 +20,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/syserror" ) // AccessTypes is a bitmask of Unix file permissions. @@ -77,7 +77,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu // the caller's user namespace; compare // kernel/capability.c:privileged_wrt_inode_uidgid(). if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { - return syserror.EACCES + return linuxerr.EACCES } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. @@ -94,7 +94,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu return nil } } - return syserror.EACCES + return linuxerr.EACCES } // MayLink determines whether creating a hard link to a file with the given @@ -110,12 +110,12 @@ func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid // Only regular files can be hard linked. if mode.FileType() != linux.S_IFREG { - return syserror.EPERM + return linuxerr.EPERM } // Setuid files should not get pinned to the filesystem. if mode&linux.S_ISUID != 0 { - return syserror.EPERM + return linuxerr.EPERM } // Executable setgid files should not get pinned to the filesystem, but we @@ -123,7 +123,7 @@ func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid // Hardlinking to unreadable or unwritable sources is dangerous. if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil { - return syserror.EPERM + return linuxerr.EPERM } return nil } @@ -194,12 +194,12 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOpt return err } if limit < int64(stat.Size) { - return syserror.ErrExceedsFileSizeLimit + return linuxerr.ErrExceedsFileSizeLimit } } if stat.Mask&linux.STATX_MODE != 0 { if !CanActAsOwner(creds, kuid) { - return syserror.EPERM + return linuxerr.EPERM } // TODO(b/30815691): "If the calling process is not privileged (Linux: // does not have the CAP_FSETID capability), and the group of the file @@ -210,13 +210,13 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOpt if stat.Mask&linux.STATX_UID != 0 { if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { - return syserror.EPERM + return linuxerr.EPERM } } if stat.Mask&linux.STATX_GID != 0 { if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { - return syserror.EPERM + return linuxerr.EPERM } } if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { @@ -229,7 +229,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOpt if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { - return syserror.EPERM + return linuxerr.EPERM } if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err @@ -252,7 +252,7 @@ func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, paren HasCapabilityOnFile(creds, linux.CAP_FOWNER, childKUID, childKGID) { return nil } - return syserror.EPERM + return linuxerr.EPERM } // CanActAsOwner returns true if creds can act as the owner of a file with the @@ -281,7 +281,7 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { return size, nil } if offset >= int64(fileSizeLimit) { - return 0, syserror.ErrExceedsFileSizeLimit + return 0, linuxerr.ErrExceedsFileSizeLimit } remaining := int64(fileSizeLimit) - offset if remaining < size { @@ -306,9 +306,9 @@ func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux. return nil } if ats.MayWrite() { - return syserror.EPERM + return linuxerr.EPERM } - return syserror.ENODATA + return linuxerr.ENODATA case strings.HasPrefix(name, linux.XATTR_USER_PREFIX): // In the user.* namespace, only regular files and directories can have // extended attributes. For sticky directories, only the owner and @@ -316,12 +316,12 @@ func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux. filetype := mode.FileType() if filetype != linux.ModeRegular && filetype != linux.ModeDirectory { if ats.MayWrite() { - return syserror.EPERM + return linuxerr.EPERM } - return syserror.ENODATA + return linuxerr.ENODATA } if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) { - return syserror.EPERM + return linuxerr.EPERM } } return nil diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 97b898aba..7fd7f000d 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // ResolvingPath represents the state of an in-progress path resolution, shared @@ -327,10 +327,10 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool { // Postconditions: If HandleSymlink returns a nil error, then !rp.Done(). func (rp *ResolvingPath) HandleSymlink(target string) error { if rp.symlinks >= linux.MaxSymlinkTraversals { - return syserror.ELOOP + return linuxerr.ELOOP } if len(target) == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } rp.symlinks++ targetPath := fspath.Parse(target) @@ -377,7 +377,7 @@ func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { // Preconditions: !rp.Done(). func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { if rp.symlinks >= linux.MaxSymlinkTraversals { - return syserror.ELOOP + return linuxerr.ELOOP } rp.symlinks++ // Consume the path component that represented the magic link. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 87fdcf403..1b2a668c0 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -42,12 +42,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. @@ -278,14 +278,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential if !newpop.Path.Begin.Ok() { oldVD.DecRef(ctx) if newpop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if newpop.FollowFinalSymlink { oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) @@ -315,13 +315,13 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia // pop.Path should not be empty in operations that create/delete files. // This is consistent with mkdirat(dirfd, "", mode). if pop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is // also honored." - mkdir(2) @@ -347,19 +347,19 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia } // MknodAt creates a file of the given mode at the given path. It returns an -// error from the syserror package. +// error from the linuxerr package. func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with mknodat(dirfd, "", mode, dev). if pop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -402,13 +402,13 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential // filesystem implementations that do not support it). if opts.Flags&linux.O_TMPFILE != 0 { if opts.Flags&linux.O_DIRECTORY == 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } // O_PATH causes most other flags to be ignored. @@ -426,9 +426,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential if opts.Flags&linux.O_DIRECTORY != 0 { rp.mustBeDir = true } - // Ignore O_PATH for verity, as verity performs extra operations on the fd for verification. - // The underlying filesystem that verity wraps opens the fd with O_PATH. - if opts.Flags&linux.O_PATH != 0 && rp.mount.fs.FilesystemType().Name() != "verity" { + if opts.Flags&linux.O_PATH != 0 { vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) if err != nil { return nil, err @@ -448,7 +446,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential if opts.FileExec { if fd.Mount().Flags.NoExec { fd.DecRef(ctx) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } // Only a regular file can be executed. @@ -459,7 +457,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { fd.DecRef(ctx) - return nil, syserror.EACCES + return nil, linuxerr.EACCES } } @@ -493,13 +491,13 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { if !oldpop.Path.Begin.Ok() { if oldpop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if oldpop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) @@ -508,20 +506,20 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti } if oldName == "." || oldName == ".." { oldParentVD.DecRef(ctx) - return syserror.EBUSY + return linuxerr.EBUSY } if !newpop.Path.Begin.Ok() { oldParentVD.DecRef(ctx) if newpop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if newpop.FollowFinalSymlink { oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) @@ -555,13 +553,13 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). if pop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -638,13 +636,13 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent // pop.Path should not be empty in operations that create/delete files. // This is consistent with symlinkat(oldpath, newdirfd, ""). if pop.Path.Absolute { - return syserror.EEXIST + return linuxerr.EEXIST } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -672,13 +670,13 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", 0). if pop.Path.Absolute { - return syserror.EBUSY + return linuxerr.EBUSY } - return syserror.ENOENT + return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") - return syserror.EINVAL + return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) @@ -731,8 +729,8 @@ func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Crede rp.Release(ctx) return names, nil } - if err == syserror.ENOTSUP { - // Linux doesn't actually return ENOTSUP in this case; instead, + if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { + // Linux doesn't actually return EOPNOTSUPP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by // default don't exist. @@ -830,14 +828,14 @@ func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string Path: fspath.Parse(currentPath), } stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) - switch err { - case nil: + switch { + case err == nil: if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory { - return syserror.ENOTDIR + return linuxerr.ENOTDIR } // Directory already exists. return nil - case syserror.ENOENT: + case linuxerr.Equals(linuxerr.ENOENT, err): // Expected, we will create the dir. default: return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) @@ -871,7 +869,7 @@ func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, targe Root: root, Start: root, Path: fspath.Parse(target), - }, mkdirOpts); err != nil && err != syserror.EEXIST { + }, mkdirOpts); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) { return fmt.Errorf("failed to create mountpoint %q: %w", target, err) } return nil diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 8d563d53a..e8f7d1f01 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -77,11 +77,6 @@ var DefaultOpts = Opts{ // trigger it. const descheduleThreshold = 1 * time.Second -var ( - stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout") - stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected") -) - // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck. var stackDumpSameTaskPeriod = time.Minute @@ -242,7 +237,6 @@ func (w *Watchdog) waitForStart() { return } - stuckStartup.Increment() metric.WeirdnessMetric.Increment("watchdog_stuck_startup") var buf bytes.Buffer @@ -316,7 +310,6 @@ func (w *Watchdog) runTurn() { // unless they are surrounded by // Task.UninterruptibleSleepStart/Finish. tc = &offender{lastUpdateTime: lastUpdateTime} - stuckTasks.Increment() metric.WeirdnessMetric.Increment("watchdog_stuck_tasks") newTaskFound = true } diff --git a/pkg/shim/BUILD b/pkg/shim/BUILD index fd6127b97..367765209 100644 --- a/pkg/shim/BUILD +++ b/pkg/shim/BUILD @@ -6,6 +6,7 @@ go_library( name = "shim", srcs = [ "api.go", + "debug.go", "epoll.go", "options.go", "service.go", diff --git a/pkg/shim/debug.go b/pkg/shim/debug.go new file mode 100644 index 000000000..49f01990e --- /dev/null +++ b/pkg/shim/debug.go @@ -0,0 +1,48 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package shim + +import ( + "os" + "os/signal" + "runtime" + "sync" + "syscall" + + "github.com/containerd/containerd/log" +) + +var once sync.Once + +func setDebugSigHandler() { + once.Do(func() { + dumpCh := make(chan os.Signal, 1) + signal.Notify(dumpCh, syscall.SIGUSR2) + go func() { + buf := make([]byte, 10240) + for range dumpCh { + for { + n := runtime.Stack(buf, true) + if n >= len(buf) { + buf = make([]byte, 2*len(buf)) + continue + } + log.L.Debugf("User requested stack trace:\n%s", buf[:n]) + } + } + }() + log.L.Debugf("For full process dump run: kill -%d %d", syscall.SIGUSR2, os.Getpid()) + }) +} diff --git a/pkg/shim/epoll.go b/pkg/shim/epoll.go index 737d2b781..463e11a84 100644 --- a/pkg/shim/epoll.go +++ b/pkg/shim/epoll.go @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package shim diff --git a/pkg/shim/proc/BUILD b/pkg/shim/proc/BUILD index 544bdc170..c8527a6d9 100644 --- a/pkg/shim/proc/BUILD +++ b/pkg/shim/proc/BUILD @@ -20,7 +20,9 @@ go_library( "//shim:__subpackages__", ], deps = [ + "//pkg/cleanup", "//pkg/shim/runsc", + "//pkg/shim/utils", "@com_github_containerd_console//:go_default_library", "@com_github_containerd_containerd//errdefs:go_default_library", "@com_github_containerd_containerd//log:go_default_library", diff --git a/pkg/shim/proc/deleted_state.go b/pkg/shim/proc/deleted_state.go index d9b970c4d..b0bbe4d7e 100644 --- a/pkg/shim/proc/deleted_state.go +++ b/pkg/shim/proc/deleted_state.go @@ -22,28 +22,38 @@ import ( "github.com/containerd/console" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/pkg/process" + runc "github.com/containerd/go-runc" ) type deletedState struct{} -func (*deletedState) Resize(ws console.WinSize) error { - return fmt.Errorf("cannot resize a deleted process.ss") +func (*deletedState) Resize(console.WinSize) error { + return fmt.Errorf("cannot resize a deleted container/process") } -func (*deletedState) Start(ctx context.Context) error { - return fmt.Errorf("cannot start a deleted process.ss") +func (*deletedState) Start(context.Context) error { + return fmt.Errorf("cannot start a deleted container/process") } -func (*deletedState) Delete(ctx context.Context) error { - return fmt.Errorf("cannot delete a deleted process.ss: %w", errdefs.ErrNotFound) +func (*deletedState) Delete(context.Context) error { + return fmt.Errorf("cannot delete a deleted container/process: %w", errdefs.ErrNotFound) } -func (*deletedState) Kill(ctx context.Context, sig uint32, all bool) error { - return fmt.Errorf("cannot kill a deleted process.ss: %w", errdefs.ErrNotFound) +func (*deletedState) Kill(_ context.Context, signal uint32, _ bool) error { + return handleStoppedKill(signal) } -func (*deletedState) SetExited(status int) {} +func (*deletedState) SetExited(int) {} -func (*deletedState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) { +func (*deletedState) Exec(context.Context, string, *ExecConfig) (process.Process, error) { return nil, fmt.Errorf("cannot exec in a deleted state") } + +func (s *deletedState) State(context.Context) (string, error) { + // There is no "deleted" state, closest one is stopped. + return "stopped", nil +} + +func (s *deletedState) Stats(context.Context, string) (*runc.Stats, error) { + return nil, fmt.Errorf("cannot stat a stopped container/process") +} diff --git a/pkg/shim/proc/exec.go b/pkg/shim/proc/exec.go index e7968d9d5..da2e21598 100644 --- a/pkg/shim/proc/exec.go +++ b/pkg/shim/proc/exec.go @@ -26,11 +26,13 @@ import ( "github.com/containerd/console" "github.com/containerd/containerd/errdefs" + "github.com/containerd/containerd/log" "github.com/containerd/containerd/pkg/stdio" "github.com/containerd/fifo" runc "github.com/containerd/go-runc" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/shim/runsc" ) @@ -92,6 +94,12 @@ func (e *execProcess) SetExited(status int) { } func (e *execProcess) setExited(status int) { + if !e.exited.IsZero() { + log.L.Debugf("Exec: status already set to %d, ignoring status: %d", e.status, status) + return + } + + log.L.Debugf("Exec: setting status: %d", status) e.status = status e.exited = time.Now() e.parent.Platform.ShutdownConsole(context.Background(), e.console) @@ -105,7 +113,7 @@ func (e *execProcess) Delete(ctx context.Context) error { return e.execState.Delete(ctx) } -func (e *execProcess) delete(ctx context.Context) error { +func (e *execProcess) delete() { e.wg.Wait() if e.io != nil { for _, c := range e.closers { @@ -113,13 +121,6 @@ func (e *execProcess) delete(ctx context.Context) error { } e.io.Close() } - pidfile := filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id)) - // silently ignore error - os.Remove(pidfile) - internalPidfile := filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id)) - // silently ignore error - os.Remove(internalPidfile) - return nil } func (e *execProcess) Resize(ws console.WinSize) error { @@ -145,16 +146,13 @@ func (e *execProcess) Kill(ctx context.Context, sig uint32, _ bool) error { func (e *execProcess) kill(ctx context.Context, sig uint32, _ bool) error { internalPid := e.internalPid - if internalPid != 0 { - if err := e.parent.runtime.Kill(ctx, e.parent.id, int(sig), &runsc.KillOpts{ - Pid: internalPid, - }); err != nil { - // If this returns error, consider the process has - // already stopped. - // - // TODO: Fix after signal handling is fixed. - return fmt.Errorf("%s: %w", err.Error(), errdefs.ErrNotFound) - } + if internalPid == 0 { + return nil + } + + opts := runsc.KillOpts{Pid: internalPid} + if err := e.parent.runtime.Kill(ctx, e.parent.id, int(sig), &opts); err != nil { + return fmt.Errorf("%s: %w", err.Error(), errdefs.ErrNotFound) } return nil } @@ -174,42 +172,53 @@ func (e *execProcess) Start(ctx context.Context) error { return e.execState.Start(ctx) } -func (e *execProcess) start(ctx context.Context) (err error) { - var ( - socket *runc.Socket - pidfile = filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id)) - internalPidfile = filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id)) - ) - if e.stdio.Terminal { - if socket, err = runc.NewTempConsoleSocket(); err != nil { +func (e *execProcess) start(ctx context.Context) error { + var socket *runc.Socket + + switch { + case e.stdio.Terminal: + s, err := runc.NewTempConsoleSocket() + if err != nil { return fmt.Errorf("failed to create runc console socket: %w", err) } - defer socket.Close() - } else if e.stdio.IsNull() { - if e.io, err = runc.NewNullIO(); err != nil { + defer s.Close() + socket = s + + case e.stdio.IsNull(): + io, err := runc.NewNullIO() + if err != nil { return fmt.Errorf("creating new NULL IO: %w", err) } - } else { - if e.io, err = runc.NewPipeIO(e.parent.IoUID, e.parent.IoGID, withConditionalIO(e.stdio)); err != nil { + e.io = io + + default: + io, err := runc.NewPipeIO(e.parent.IoUID, e.parent.IoGID, withConditionalIO(e.stdio)) + if err != nil { return fmt.Errorf("failed to create runc io pipes: %w", err) } + e.io = io } + opts := &runsc.ExecOpts{ - PidFile: pidfile, - InternalPidFile: internalPidfile, + PidFile: filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id)), + InternalPidFile: filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id)), IO: e.io, Detach: true, } + defer func() { + _ = os.Remove(opts.PidFile) + _ = os.Remove(opts.InternalPidFile) + }() if socket != nil { opts.ConsoleSocket = socket } + eventCh := e.parent.Monitor.Subscribe() - defer func() { - // Unsubscribe if an error is returned. - if err != nil { - e.parent.Monitor.Unsubscribe(eventCh) - } - }() + cu := cleanup.Make(func() { + e.parent.Monitor.Unsubscribe(eventCh) + }) + defer cu.Clean() + if err := e.parent.runtime.Exec(ctx, e.parent.id, e.spec, opts); err != nil { close(e.waitBlock) return e.parent.runtimeError(err, "OCI runtime exec failed") @@ -237,6 +246,7 @@ func (e *execProcess) start(ctx context.Context) (err error) { return fmt.Errorf("failed to start io pipe copy: %w", err) } } + pid, err := runc.ReadPidFile(opts.PidFile) if err != nil { return fmt.Errorf("failed to retrieve OCI runtime exec pid: %w", err) @@ -247,6 +257,7 @@ func (e *execProcess) start(ctx context.Context) (err error) { return fmt.Errorf("failed to retrieve OCI runtime exec internal pid: %w", err) } e.internalPid = internalPid + go func() { defer e.parent.Monitor.Unsubscribe(eventCh) for event := range eventCh { @@ -260,21 +271,25 @@ func (e *execProcess) start(ctx context.Context) (err error) { } } }() + + cu.Release() // cancel cleanup on success. return nil } -func (e *execProcess) Status(ctx context.Context) (string, error) { +func (e *execProcess) Status(context.Context) (string, error) { e.mu.Lock() defer e.mu.Unlock() // if we don't have a pid then the exec process has just been created if e.pid == 0 { return "created", nil } - // if we have a pid and it can be signaled, the process is running - // TODO(random-liu): Use `runsc kill --pid`. - if err := unix.Kill(e.pid, 0); err == nil { - return "running", nil + // This checks that `runsc exec` process is still running. This process has + // the same lifetime as the process executing inside the container. So instead + // of calling `runsc kill --pid`, just do a quick check that `runsc exec` is + // still running. + if err := unix.Kill(e.pid, 0); err != nil { + // Can't signal the process, it must have exited. + return "stopped", nil } - // else if we have a pid but it can nolonger be signaled, it has stopped - return "stopped", nil + return "running", nil } diff --git a/pkg/shim/proc/exec_state.go b/pkg/shim/proc/exec_state.go index 4dcda8b44..03ecb401a 100644 --- a/pkg/shim/proc/exec_state.go +++ b/pkg/shim/proc/exec_state.go @@ -34,18 +34,21 @@ type execCreatedState struct { p *execProcess } -func (s *execCreatedState) transition(name string) error { - switch name { - case "running": +func (s *execCreatedState) name() string { + return "created" +} + +func (s *execCreatedState) transition(transition stateTransition) { + switch transition { + case running: s.p.execState = &execRunningState{p: s.p} - case "stopped": + case stopped: s.p.execState = &execStoppedState{p: s.p} - case "deleted": + case deleted: s.p.execState = &deletedState{} default: - return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) + panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } - return nil } func (s *execCreatedState) Resize(ws console.WinSize) error { @@ -56,14 +59,14 @@ func (s *execCreatedState) Start(ctx context.Context) error { if err := s.p.start(ctx); err != nil { return err } - return s.transition("running") + s.transition(running) + return nil } -func (s *execCreatedState) Delete(ctx context.Context) error { - if err := s.p.delete(ctx); err != nil { - return err - } - return s.transition("deleted") +func (s *execCreatedState) Delete(context.Context) error { + s.p.delete() + s.transition(deleted) + return nil } func (s *execCreatedState) Kill(ctx context.Context, sig uint32, all bool) error { @@ -72,35 +75,35 @@ func (s *execCreatedState) Kill(ctx context.Context, sig uint32, all bool) error func (s *execCreatedState) SetExited(status int) { s.p.setExited(status) - - if err := s.transition("stopped"); err != nil { - panic(err) - } + s.transition(stopped) } type execRunningState struct { p *execProcess } -func (s *execRunningState) transition(name string) error { - switch name { - case "stopped": +func (s *execRunningState) name() string { + return "running" +} + +func (s *execRunningState) transition(transition stateTransition) { + switch transition { + case stopped: s.p.execState = &execStoppedState{p: s.p} default: - return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) + panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } - return nil } func (s *execRunningState) Resize(ws console.WinSize) error { return s.p.resize(ws) } -func (s *execRunningState) Start(ctx context.Context) error { +func (s *execRunningState) Start(context.Context) error { return fmt.Errorf("cannot start a running process") } -func (s *execRunningState) Delete(ctx context.Context) error { +func (s *execRunningState) Delete(context.Context) error { return fmt.Errorf("cannot delete a running process") } @@ -110,45 +113,44 @@ func (s *execRunningState) Kill(ctx context.Context, sig uint32, all bool) error func (s *execRunningState) SetExited(status int) { s.p.setExited(status) - - if err := s.transition("stopped"); err != nil { - panic(err) - } + s.transition(stopped) } type execStoppedState struct { p *execProcess } -func (s *execStoppedState) transition(name string) error { - switch name { - case "deleted": +func (s *execStoppedState) name() string { + return "stopped" +} + +func (s *execStoppedState) transition(transition stateTransition) { + switch transition { + case deleted: s.p.execState = &deletedState{} default: - return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) + panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } - return nil } -func (s *execStoppedState) Resize(ws console.WinSize) error { +func (s *execStoppedState) Resize(console.WinSize) error { return fmt.Errorf("cannot resize a stopped container") } -func (s *execStoppedState) Start(ctx context.Context) error { +func (s *execStoppedState) Start(context.Context) error { return fmt.Errorf("cannot start a stopped process") } -func (s *execStoppedState) Delete(ctx context.Context) error { - if err := s.p.delete(ctx); err != nil { - return err - } - return s.transition("deleted") +func (s *execStoppedState) Delete(context.Context) error { + s.p.delete() + s.transition(deleted) + return nil } -func (s *execStoppedState) Kill(ctx context.Context, sig uint32, all bool) error { - return s.p.kill(ctx, sig, all) +func (s *execStoppedState) Kill(_ context.Context, sig uint32, _ bool) error { + return handleStoppedKill(sig) } -func (s *execStoppedState) SetExited(status int) { +func (s *execStoppedState) SetExited(int) { // no op } diff --git a/pkg/shim/proc/init.go b/pkg/shim/proc/init.go index 664465e0d..6bf090813 100644 --- a/pkg/shim/proc/init.go +++ b/pkg/shim/proc/init.go @@ -39,6 +39,8 @@ import ( "gvisor.dev/gvisor/pkg/shim/runsc" ) +const statusStopped = "stopped" + // Init represents an initial process for a container. type Init struct { wg sync.WaitGroup @@ -201,10 +203,15 @@ func (p *Init) ExitedAt() time.Time { func (p *Init) Status(ctx context.Context) (string, error) { p.mu.Lock() defer p.mu.Unlock() + + return p.initState.State(ctx) +} + +func (p *Init) state(ctx context.Context) (string, error) { c, err := p.runtime.State(ctx, p.id) if err != nil { if strings.Contains(err.Error(), "does not exist") { - return "stopped", nil + return statusStopped, nil } return "", p.runtimeError(err, "OCI runtime state failed") } @@ -231,10 +238,7 @@ func (p *Init) start(ctx context.Context) error { status, err := p.runtime.Wait(context.Background(), p.id) if err != nil { log.G(ctx).WithError(err).Errorf("Failed to wait for container %q", p.id) - // TODO(random-liu): Handle runsc kill error. - if err := p.killAll(ctx); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to kill container %q", p.id) - } + p.killAllLocked(ctx) status = internalErrorCode } ExitCh <- Exit{ @@ -255,6 +259,12 @@ func (p *Init) SetExited(status int) { } func (p *Init) setExited(status int) { + if !p.exited.IsZero() { + log.L.Debugf("Status already set to %d, ignoring status: %d", p.status, status) + return + } + + log.L.Debugf("Setting status: %d", status) p.exited = time.Now() p.status = status p.Platform.ShutdownConsole(context.Background(), p.console) @@ -270,15 +280,16 @@ func (p *Init) Delete(ctx context.Context) error { } func (p *Init) delete(ctx context.Context) error { - p.killAll(ctx) + p.killAllLocked(ctx) p.wg.Wait() + err := p.runtime.Delete(ctx, p.id, nil) - // ignore errors if a runtime has already deleted the process - // but we still hold metadata and pipes - // - // this is common during a checkpoint, runc will delete the container state - // after a checkpoint and the container will no longer exist within runc if err != nil { + // ignore errors if a runtime has already deleted the process + // but we still hold metadata and pipes + // + // this is common during a checkpoint, runc will delete the container state + // after a checkpoint and the container will no longer exist within runc if strings.Contains(err.Error(), "does not exist") { err = nil } else { @@ -326,29 +337,24 @@ func (p *Init) Kill(ctx context.Context, signal uint32, all bool) error { return p.initState.Kill(ctx, signal, all) } -func (p *Init) kill(context context.Context, signal uint32, all bool) error { +func (p *Init) kill(ctx context.Context, signal uint32, all bool) error { var ( killErr error backoff = 100 * time.Millisecond ) - timeout := 1 * time.Second - for start := time.Now(); time.Now().Sub(start) < timeout; { - c, err := p.runtime.State(context, p.id) + const timeout = time.Second + for start := time.Now(); time.Since(start) < timeout; { + state, err := p.initState.State(ctx) if err != nil { - if strings.Contains(err.Error(), "does not exist") { - return fmt.Errorf("no such process: %w", errdefs.ErrNotFound) - } return p.runtimeError(err, "OCI runtime state failed") } // For runsc, signal only works when container is running state. // If the container is not in running state, directly return // "no such process" - if p.convertStatus(c.Status) == "stopped" { + if state == statusStopped { return fmt.Errorf("no such process: %w", errdefs.ErrNotFound) } - killErr = p.runtime.Kill(context, p.id, int(signal), &runsc.KillOpts{ - All: all, - }) + killErr = p.runtime.Kill(ctx, p.id, int(signal), &runsc.KillOpts{All: all}) if killErr == nil { return nil } @@ -358,22 +364,18 @@ func (p *Init) kill(context context.Context, signal uint32, all bool) error { return p.runtimeError(killErr, "kill timeout") } -// KillAll kills all processes belonging to the init process. -func (p *Init) KillAll(context context.Context) error { +// KillAll kills all processes belonging to the init process. If +// `runsc kill --all` returns error, assume the container has already stopped. +func (p *Init) KillAll(context context.Context) { p.mu.Lock() defer p.mu.Unlock() - return p.killAll(context) + p.killAllLocked(context) } -func (p *Init) killAll(context context.Context) error { - p.runtime.Kill(context, p.id, int(unix.SIGKILL), &runsc.KillOpts{ - All: true, - }) - // Ignore error handling for `runsc kill --all` for now. - // * If it doesn't return error, it is good; - // * If it returns error, consider the container has already stopped. - // TODO: Fix `runsc kill --all` error handling. - return nil +func (p *Init) killAllLocked(context context.Context) { + if err := p.runtime.Kill(context, p.id, int(unix.SIGKILL), &runsc.KillOpts{All: true}); err != nil { + log.L.Warningf("Ignoring error killing container %q: %v", p.id, err) + } } // Stdin returns the stdin of the process. @@ -396,7 +398,6 @@ func (p *Init) Exec(ctx context.Context, path string, r *ExecConfig) (process.Pr // exec returns a new exec'd process. func (p *Init) exec(path string, r *ExecConfig) (process.Process, error) { - // process exec request var spec specs.Process if err := json.Unmarshal(r.Spec.Value, &spec); err != nil { return nil, err @@ -420,6 +421,17 @@ func (p *Init) exec(path string, r *ExecConfig) (process.Process, error) { return e, nil } +func (p *Init) Stats(ctx context.Context, id string) (*runc.Stats, error) { + p.mu.Lock() + defer p.mu.Unlock() + + return p.initState.Stats(ctx, id) +} + +func (p *Init) stats(ctx context.Context, id string) (*runc.Stats, error) { + return p.Runtime().Stats(ctx, id) +} + // Stdio returns the stdio of the process. func (p *Init) Stdio() stdio.Stdio { return p.stdio @@ -444,7 +456,7 @@ func (p *Init) runtimeError(rErr error, msg string) error { func (p *Init) convertStatus(status string) string { if status == "created" && !p.Sandbox && p.status == internalErrorCode { // Treat start failure state for non-root container as stopped. - return "stopped" + return statusStopped } return status } diff --git a/pkg/shim/proc/init_state.go b/pkg/shim/proc/init_state.go index 0065fc385..5347ddefe 100644 --- a/pkg/shim/proc/init_state.go +++ b/pkg/shim/proc/init_state.go @@ -19,16 +19,40 @@ import ( "context" "fmt" - "github.com/containerd/console" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/pkg/process" + runc "github.com/containerd/go-runc" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/shim/utils" ) +type stateTransition int + +const ( + running stateTransition = iota + stopped + deleted +) + +func (s stateTransition) String() string { + switch s { + case running: + return "running" + case stopped: + return "stopped" + case deleted: + return "deleted" + default: + panic(fmt.Sprintf("unknown state: %d", s)) + } +} + type initState interface { - Resize(console.WinSize) error Start(context.Context) error Delete(context.Context) error Exec(context.Context, string, *ExecConfig) (process.Process, error) + State(ctx context.Context) (string, error) + Stats(context.Context, string) (*runc.Stats, error) Kill(context.Context, uint32, bool) error SetExited(int) } @@ -37,22 +61,21 @@ type createdState struct { p *Init } -func (s *createdState) transition(name string) error { - switch name { - case "running": +func (s *createdState) name() string { + return "created" +} + +func (s *createdState) transition(transition stateTransition) { + switch transition { + case running: s.p.initState = &runningState{p: s.p} - case "stopped": - s.p.initState = &stoppedState{p: s.p} - case "deleted": + case stopped: + s.p.initState = &stoppedState{process: s.p} + case deleted: s.p.initState = &deletedState{} default: - return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) + panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } - return nil -} - -func (s *createdState) Resize(ws console.WinSize) error { - return s.p.resize(ws) } func (s *createdState) Start(ctx context.Context) error { @@ -66,20 +89,20 @@ func (s *createdState) Start(ctx context.Context) error { if !s.p.Sandbox { s.p.io.Close() s.p.setExited(internalErrorCode) - if err := s.transition("stopped"); err != nil { - panic(err) - } + s.transition(stopped) } return err } - return s.transition("running") + s.transition(running) + return nil } func (s *createdState) Delete(ctx context.Context) error { if err := s.p.delete(ctx); err != nil { return err } - return s.transition("deleted") + s.transition(deleted) + return nil } func (s *createdState) Kill(ctx context.Context, sig uint32, all bool) error { @@ -88,40 +111,48 @@ func (s *createdState) Kill(ctx context.Context, sig uint32, all bool) error { func (s *createdState) SetExited(status int) { s.p.setExited(status) - - if err := s.transition("stopped"); err != nil { - panic(err) - } + s.transition(stopped) } func (s *createdState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) { return s.p.exec(path, r) } +func (s *createdState) State(ctx context.Context) (string, error) { + state, err := s.p.state(ctx) + if err == nil && state == statusStopped { + s.transition(stopped) + } + return state, err +} + +func (s *createdState) Stats(ctx context.Context, id string) (*runc.Stats, error) { + return s.p.stats(ctx, id) +} + type runningState struct { p *Init } -func (s *runningState) transition(name string) error { - switch name { - case "stopped": - s.p.initState = &stoppedState{p: s.p} - default: - return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) - } - return nil +func (s *runningState) name() string { + return "running" } -func (s *runningState) Resize(ws console.WinSize) error { - return s.p.resize(ws) +func (s *runningState) transition(transition stateTransition) { + switch transition { + case stopped: + s.p.initState = &stoppedState{process: s.p} + default: + panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) + } } func (s *runningState) Start(ctx context.Context) error { - return fmt.Errorf("cannot start a running process.ss") + return fmt.Errorf("cannot start a running container") } func (s *runningState) Delete(ctx context.Context) error { - return fmt.Errorf("cannot delete a running process.ss") + return fmt.Errorf("cannot delete a running container") } func (s *runningState) Kill(ctx context.Context, sig uint32, all bool) error { @@ -130,53 +161,81 @@ func (s *runningState) Kill(ctx context.Context, sig uint32, all bool) error { func (s *runningState) SetExited(status int) { s.p.setExited(status) + s.transition(stopped) +} + +func (s *runningState) Exec(_ context.Context, path string, r *ExecConfig) (process.Process, error) { + return s.p.exec(path, r) +} - if err := s.transition("stopped"); err != nil { - panic(err) +func (s *runningState) State(ctx context.Context) (string, error) { + state, err := s.p.state(ctx) + if err == nil && state == "stopped" { + s.transition(stopped) } + return state, err } -func (s *runningState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) { - return s.p.exec(path, r) +func (s *runningState) Stats(ctx context.Context, id string) (*runc.Stats, error) { + return s.p.stats(ctx, id) } type stoppedState struct { - p *Init + process *Init } -func (s *stoppedState) transition(name string) error { - switch name { - case "deleted": - s.p.initState = &deletedState{} - default: - return fmt.Errorf("invalid state transition %q to %q", stateName(s), name) - } - return nil +func (s *stoppedState) name() string { + return "stopped" } -func (s *stoppedState) Resize(ws console.WinSize) error { - return fmt.Errorf("cannot resize a stopped container") +func (s *stoppedState) transition(transition stateTransition) { + switch transition { + case deleted: + s.process.initState = &deletedState{} + default: + panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) + } } -func (s *stoppedState) Start(ctx context.Context) error { - return fmt.Errorf("cannot start a stopped process.ss") +func (s *stoppedState) Start(context.Context) error { + return fmt.Errorf("cannot start a stopped container") } func (s *stoppedState) Delete(ctx context.Context) error { - if err := s.p.delete(ctx); err != nil { + if err := s.process.delete(ctx); err != nil { return err } - return s.transition("deleted") + s.transition(deleted) + return nil } -func (s *stoppedState) Kill(ctx context.Context, sig uint32, all bool) error { - return errdefs.ToGRPCf(errdefs.ErrNotFound, "process.ss %s not found", s.p.id) +func (s *stoppedState) Kill(_ context.Context, signal uint32, _ bool) error { + return handleStoppedKill(signal) } func (s *stoppedState) SetExited(status int) { - // no op + s.process.setExited(status) } -func (s *stoppedState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) { +func (s *stoppedState) Exec(context.Context, string, *ExecConfig) (process.Process, error) { return nil, fmt.Errorf("cannot exec in a stopped state") } + +func (s *stoppedState) State(context.Context) (string, error) { + return "stopped", nil +} + +func (s *stoppedState) Stats(context.Context, string) (*runc.Stats, error) { + return nil, fmt.Errorf("cannot stat a stopped container") +} + +func handleStoppedKill(signal uint32) error { + switch unix.Signal(signal) { + case unix.SIGTERM, unix.SIGKILL: + // Container is already stopped, so everything inside the container has + // already been killed. + return nil + default: + return utils.ErrToGRPCf(errdefs.ErrNotFound, "process not found") + } +} diff --git a/pkg/shim/proc/proc.go b/pkg/shim/proc/proc.go index edba3fca5..89ad3f505 100644 --- a/pkg/shim/proc/proc.go +++ b/pkg/shim/proc/proc.go @@ -17,23 +17,5 @@ // the sandbox process running the container. package proc -import ( - "fmt" -) - // RunscRoot is the path to the root runsc state directory. const RunscRoot = "/run/containerd/runsc" - -func stateName(v interface{}) string { - switch v.(type) { - case *runningState, *execRunningState: - return "running" - case *createdState, *execCreatedState: - return "created" - case *deletedState: - return "deleted" - case *stoppedState: - return "stopped" - } - panic(fmt.Errorf("invalid state %v", v)) -} diff --git a/pkg/shim/runsc/runsc.go b/pkg/shim/runsc/runsc.go index ff0521d73..888cb0bcb 100644 --- a/pkg/shim/runsc/runsc.go +++ b/pkg/shim/runsc/runsc.go @@ -17,6 +17,7 @@ package runsc import ( + "bytes" "context" "encoding/json" "fmt" @@ -73,9 +74,9 @@ type Runsc struct { // List returns all containers created inside the provided runsc root directory. func (r *Runsc) List(context context.Context) ([]*runc.Container, error) { - data, err := cmdOutput(r.command(context, "list", "--format=json"), false) + data, stderr, err := cmdOutput(r.command(context, "list", "--format=json"), false) if err != nil { - return nil, err + return nil, fmt.Errorf("%w: %s", err, stderr) } var out []*runc.Container if err := json.Unmarshal(data, &out); err != nil { @@ -86,9 +87,9 @@ func (r *Runsc) List(context context.Context) ([]*runc.Container, error) { // State returns the state for the container provided by id. func (r *Runsc) State(context context.Context, id string) (*runc.Container, error) { - data, err := cmdOutput(r.command(context, "state", id), true) + data, stderr, err := cmdOutput(r.command(context, "state", id), false) if err != nil { - return nil, fmt.Errorf("%s: %s", err, data) + return nil, fmt.Errorf("%w: %s", err, stderr) } var c runc.Container if err := json.Unmarshal(data, &c); err != nil { @@ -142,9 +143,9 @@ func (r *Runsc) Create(context context.Context, id, bundle string, opts *CreateO } if cmd.Stdout == nil && cmd.Stderr == nil { - data, err := cmdOutput(cmd, true) + out, _, err := cmdOutput(cmd, true) if err != nil { - return fmt.Errorf("%s: %s", err, data) + return fmt.Errorf("%w: %s", err, out) } return nil } @@ -168,15 +169,15 @@ func (r *Runsc) Create(context context.Context, id, bundle string, opts *CreateO } func (r *Runsc) Pause(context context.Context, id string) error { - if _, err := cmdOutput(r.command(context, "pause", id), true); err != nil { - return fmt.Errorf("unable to pause: %w", err) + if out, _, err := cmdOutput(r.command(context, "pause", id), true); err != nil { + return fmt.Errorf("unable to pause: %w: %s", err, out) } return nil } func (r *Runsc) Resume(context context.Context, id string) error { - if _, err := cmdOutput(r.command(context, "resume", id), true); err != nil { - return fmt.Errorf("unable to resume: %w", err) + if out, _, err := cmdOutput(r.command(context, "resume", id), true); err != nil { + return fmt.Errorf("unable to resume: %w: %s", err, out) } return nil } @@ -189,9 +190,9 @@ func (r *Runsc) Start(context context.Context, id string, cio runc.IO) error { } if cmd.Stdout == nil && cmd.Stderr == nil { - data, err := cmdOutput(cmd, true) + out, _, err := cmdOutput(cmd, true) if err != nil { - return fmt.Errorf("%s: %s", err, data) + return fmt.Errorf("%w: %s", err, out) } return nil } @@ -221,12 +222,10 @@ type waitResult struct { } // Wait will wait for a running container, and return its exit status. -// -// TODO(random-liu): Add exec process support. func (r *Runsc) Wait(context context.Context, id string) (int, error) { - data, err := cmdOutput(r.command(context, "wait", id), true) + data, stderr, err := cmdOutput(r.command(context, "wait", id), false) if err != nil { - return 0, fmt.Errorf("%s: %s", err, data) + return 0, fmt.Errorf("%w: %s", err, stderr) } var res waitResult if err := json.Unmarshal(data, &res); err != nil { @@ -294,9 +293,9 @@ func (r *Runsc) Exec(context context.Context, id string, spec specs.Process, opt opts.Set(cmd) } if cmd.Stdout == nil && cmd.Stderr == nil { - data, err := cmdOutput(cmd, true) + out, _, err := cmdOutput(cmd, true) if err != nil { - return fmt.Errorf("%s: %s", err, data) + return fmt.Errorf("%w: %s", err, out) } return nil } @@ -391,20 +390,12 @@ func (r *Runsc) Kill(context context.Context, id string, sig int, opts *KillOpts // Stats return the stats for a container like cpu, memory, and I/O. func (r *Runsc) Stats(context context.Context, id string) (*runc.Stats, error) { cmd := r.command(context, "events", "--stats", id) - rd, err := cmd.StdoutPipe() - if err != nil { - return nil, err - } - ec, err := Monitor.Start(cmd) + data, stderr, err := cmdOutput(cmd, false) if err != nil { - return nil, err + return nil, fmt.Errorf("%w: %s", err, stderr) } - defer func() { - rd.Close() - Monitor.Wait(cmd, ec) - }() var e runc.Event - if err := json.NewDecoder(rd).Decode(&e); err != nil { + if err := json.Unmarshal(data, &e); err != nil { log.L.Debugf("Parsing events error: %v", err) return nil, err } @@ -459,9 +450,9 @@ func (r *Runsc) Events(context context.Context, id string, interval time.Duratio // Ps lists all the processes inside the container returning their pids. func (r *Runsc) Ps(context context.Context, id string) ([]int, error) { - data, err := cmdOutput(r.command(context, "ps", "--format", "json", id), true) + data, stderr, err := cmdOutput(r.command(context, "ps", "--format", "json", id), false) if err != nil { - return nil, fmt.Errorf("%s: %s", err, data) + return nil, fmt.Errorf("%w: %s", err, stderr) } var pids []int if err := json.Unmarshal(data, &pids); err != nil { @@ -472,9 +463,9 @@ func (r *Runsc) Ps(context context.Context, id string) ([]int, error) { // Top lists all the processes inside the container returning the full ps data. func (r *Runsc) Top(context context.Context, id string) (*runc.TopResults, error) { - data, err := cmdOutput(r.command(context, "ps", "--format", "table", id), true) + data, stderr, err := cmdOutput(r.command(context, "ps", "--format", "table", id), false) if err != nil { - return nil, fmt.Errorf("%s: %s", err, data) + return nil, fmt.Errorf("%w: %s", err, stderr) } topResults, err := runc.ParsePSOutput(data) @@ -517,9 +508,9 @@ func (r *Runsc) runOrError(cmd *exec.Cmd) error { } return err } - data, err := cmdOutput(cmd, true) + out, _, err := cmdOutput(cmd, true) if err != nil { - return fmt.Errorf("%s: %s", err, data) + return fmt.Errorf("%w: %s", err, out) } return nil } @@ -540,23 +531,29 @@ func (r *Runsc) command(context context.Context, args ...string) *exec.Cmd { return cmd } -func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, error) { - b := getBuf() - defer putBuf(b) +func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, []byte, error) { + stdout := getBuf() + defer putBuf(stdout) + cmd.Stdout = stdout + cmd.Stderr = stdout - cmd.Stdout = b - if combined { - cmd.Stderr = b + var stderr *bytes.Buffer + if !combined { + stderr = getBuf() + defer putBuf(stderr) + cmd.Stderr = stderr } ec, err := Monitor.Start(cmd) if err != nil { - return nil, err + return nil, nil, err } status, err := Monitor.Wait(cmd, ec) if err == nil && status != 0 { - err = fmt.Errorf("%s did not terminate sucessfully", cmd.Args[0]) + err = fmt.Errorf("%q did not terminate sucessfully", cmd.Args[0]) } - - return b.Bytes(), err + if stderr == nil { + return stdout.Bytes(), nil, err + } + return stdout.Bytes(), stderr.Bytes(), err } diff --git a/pkg/shim/runtimeoptions/runtimeoptions.go b/pkg/shim/runtimeoptions/runtimeoptions.go index 072dd87f0..e76d73ea7 100644 --- a/pkg/shim/runtimeoptions/runtimeoptions.go +++ b/pkg/shim/runtimeoptions/runtimeoptions.go @@ -15,3 +15,10 @@ // Package runtimeoptions contains the runtimeoptions proto. package runtimeoptions + +import proto "github.com/gogo/protobuf/proto" + +func init() { + // TODO(gvisor.dev/issue/6449): Upgrade runtimeoptions.proto after upgrading to containerd 1.5 + proto.RegisterType((*Options)(nil), "runtimeoptions.v1.Options") +} diff --git a/pkg/shim/runtimeoptions/runtimeoptions_cri.go b/pkg/shim/runtimeoptions/runtimeoptions_cri.go index e6102b4cf..23bbd82be 100644 --- a/pkg/shim/runtimeoptions/runtimeoptions_cri.go +++ b/pkg/shim/runtimeoptions/runtimeoptions_cri.go @@ -13,6 +13,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package runtimeoptions import ( diff --git a/pkg/shim/service.go b/pkg/shim/service.go index 1f9adcb65..24e3b7a82 100644 --- a/pkg/shim/service.go +++ b/pkg/shim/service.go @@ -81,8 +81,6 @@ const ( // New returns a new shim service that can be used via GRPC. func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) { - log.L.Debugf("service.New, id: %s", id) - var opts shim.Opts if ctxOpts := ctx.Value(shim.OptsKey{}); ctxOpts != nil { opts = ctxOpts.(shim.Opts) @@ -304,8 +302,6 @@ func (s *service) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) // Create creates a new initial process and container with the underlying OCI // runtime. func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) { - log.L.Debugf("Create, id: %s, bundle: %q", r.ID, r.Bundle) - s.mu.Lock() defer s.mu.Unlock() @@ -396,6 +392,9 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*ta log.L.Debugf("stdout: %s", r.Stdout) log.L.Debugf("stderr: %s", r.Stderr) log.L.Debugf("***************************") + if log.L.Logger.IsLevelEnabled(logrus.DebugLevel) { + setDebugSigHandler() + } } // Save state before any action is taken to ensure Cleanup() will have all @@ -453,10 +452,10 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*ta } process, err := newInit(r.Bundle, filepath.Join(r.Bundle, "work"), ns, s.platform, config, &s.opts, st.Rootfs) if err != nil { - return nil, errdefs.ToGRPC(err) + return nil, utils.ErrToGRPC(err) } if err := process.Create(ctx, config); err != nil { - return nil, errdefs.ToGRPC(err) + return nil, utils.ErrToGRPC(err) } // Set up OOM notification on the sandbox's cgroup. This is done on @@ -506,9 +505,6 @@ func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAP if err != nil { return nil, err } - if p == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") - } if err := p.Delete(ctx); err != nil { return nil, err } @@ -534,10 +530,10 @@ func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*typ p := s.processes[r.ExecID] s.mu.Unlock() if p != nil { - return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID) + return nil, utils.ErrToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID) } if s.task == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } process, err := s.task.Exec(ctx, s.bundle, &proc.ExecConfig{ ID: r.ExecID, @@ -548,7 +544,7 @@ func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*typ Spec: r.Spec, }) if err != nil { - return nil, errdefs.ToGRPC(err) + return nil, utils.ErrToGRPC(err) } s.mu.Lock() s.processes[r.ExecID] = process @@ -569,7 +565,7 @@ func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (* Height: uint16(r.Height), } if err := p.Resize(ws); err != nil { - return nil, errdefs.ToGRPC(err) + return nil, utils.ErrToGRPC(err) } return empty, nil } @@ -580,10 +576,12 @@ func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI. p, err := s.getProcess(r.ExecID) if err != nil { + log.L.Debugf("State failed to find process: %v", err) return nil, err } st, err := p.Status(ctx) if err != nil { + log.L.Debugf("State failed: %v", err) return nil, err } status := task.StatusUnknown @@ -596,7 +594,7 @@ func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI. status = task.StatusStopped } sio := p.Stdio() - return &taskAPI.StateResponse{ + res := &taskAPI.StateResponse{ ID: p.ID(), Bundle: s.bundle, Pid: uint32(p.Pid()), @@ -607,7 +605,9 @@ func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI. Terminal: sio.Terminal, ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), - }, nil + } + log.L.Debugf("State succeeded, response: %+v", res) + return res, nil } // Pause the container. @@ -615,7 +615,7 @@ func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Em log.L.Debugf("Pause, id: %s", r.ID) if s.task == nil { log.L.Debugf("Pause error, id: %s: container not created", r.ID) - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Pause(ctx, r.ID) if err != nil { @@ -629,7 +629,7 @@ func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types. log.L.Debugf("Resume, id: %s", r.ID) if s.task == nil { log.L.Debugf("Resume error, id: %s: container not created", r.ID) - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Resume(ctx, r.ID) if err != nil { @@ -646,12 +646,11 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empt if err != nil { return nil, err } - if p == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") - } if err := p.Kill(ctx, r.Signal, r.All); err != nil { - return nil, errdefs.ToGRPC(err) + log.L.Debugf("Kill failed: %v", err) + return nil, utils.ErrToGRPC(err) } + log.L.Debugf("Kill succeeded") return empty, nil } @@ -661,7 +660,7 @@ func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.Pi pids, err := s.getContainerPids(ctx, r.ID) if err != nil { - return nil, errdefs.ToGRPC(err) + return nil, utils.ErrToGRPC(err) } var processes []*task.ProcessInfo for _, pid := range pids { @@ -707,7 +706,7 @@ func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*type // Checkpoint checkpoints the container. func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*types.Empty, error) { log.L.Debugf("Checkpoint, id: %s", r.ID) - return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) + return empty, utils.ErrToGRPC(errdefs.ErrNotImplemented) } // Connect returns shim information such as the shim's pid. @@ -738,9 +737,9 @@ func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI. log.L.Debugf("Stats, id: %s", r.ID) if s.task == nil { log.L.Debugf("Stats error, id: %s: container not created", r.ID) - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } - stats, err := s.task.Runtime().Stats(ctx, s.id) + stats, err := s.task.Stats(ctx, s.id) if err != nil { log.L.Debugf("Stats error, id: %s: %v", r.ID, err) return nil, err @@ -812,7 +811,7 @@ func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI. // Update updates a running container. func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*types.Empty, error) { - return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented) + return empty, utils.ErrToGRPC(errdefs.ErrNotImplemented) } // Wait waits for a process to exit. @@ -821,17 +820,17 @@ func (s *service) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.Wa p, err := s.getProcess(r.ExecID) if err != nil { + log.L.Debugf("Wait failed to find process: %v", err) return nil, err } - if p == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") - } p.Wait() - return &taskAPI.WaitResponse{ + res := &taskAPI.WaitResponse{ ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), - }, nil + } + log.L.Debugf("Wait succeeded, response: %+v", res) + return res, nil } func (s *service) processExits(ctx context.Context) { @@ -848,10 +847,7 @@ func (s *service) checkProcesses(ctx context.Context, e proc.Exit) { if ip, ok := p.(*proc.Init); ok { // Ensure all children are killed. log.L.Debugf("Container init process exited, killing all container processes") - if err := ip.KillAll(ctx); err != nil { - log.G(ctx).WithError(err).WithField("id", ip.ID()). - Error("failed to kill init's children") - } + ip.KillAll(ctx) } p.SetExited(e.Status) s.events <- &events.TaskExit{ @@ -909,12 +905,17 @@ func (s *service) forward(ctx context.Context, publisher shim.Publisher) { func (s *service) getProcess(execID string) (process.Process, error) { s.mu.Lock() defer s.mu.Unlock() + if execID == "" { + if s.task == nil { + return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") + } return s.task, nil } + p := s.processes[execID] if p == nil { - return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID) + return nil, utils.ErrToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID) } return p, nil } diff --git a/pkg/shim/service_linux.go b/pkg/shim/service_linux.go index 829f69282..fb2f8b062 100644 --- a/pkg/shim/service_linux.go +++ b/pkg/shim/service_linux.go @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package shim diff --git a/pkg/shim/utils/BUILD b/pkg/shim/utils/BUILD index 54a0aabb7..2eb82f63c 100644 --- a/pkg/shim/utils/BUILD +++ b/pkg/shim/utils/BUILD @@ -6,6 +6,7 @@ go_library( name = "utils", srcs = [ "annotations.go", + "errors.go", "utils.go", "volumes.go", ], @@ -14,14 +15,23 @@ go_library( "//shim:__subpackages__", ], deps = [ + "@com_github_containerd_containerd//errdefs:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + "@org_golang_google_grpc//codes:go_default_library", + "@org_golang_google_grpc//status:go_default_library", ], ) go_test( name = "utils_test", size = "small", - srcs = ["volumes_test.go"], + srcs = [ + "errors_test.go", + "volumes_test.go", + ], library = ":utils", - deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"], + deps = [ + "@com_github_containerd_containerd//errdefs:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + ], ) diff --git a/pkg/shim/utils/errors.go b/pkg/shim/utils/errors.go new file mode 100644 index 000000000..971d68c36 --- /dev/null +++ b/pkg/shim/utils/errors.go @@ -0,0 +1,74 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "context" + "errors" + "fmt" + + "github.com/containerd/containerd/errdefs" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// ErrToGRPC wraps containerd's ToGRPC error mapper which depends on +// github.com/pkg/errors to work correctly. Once we upgrade to containerd v1.4, +// this function can go away and we can use errdefs.ToGRPC directly instead. +// +// TODO(gvisor.dev/issue/6232): Remove after upgrading to containerd v1.4 +func ErrToGRPC(err error) error { + return errToGRPCMsg(err, err.Error()) +} + +// ErrToGRPCf maps the error to grpc error codes, assembling the formatting +// string and combining it with the target error string. +// +// TODO(gvisor.dev/issue/6232): Remove after upgrading to containerd v1.4 +func ErrToGRPCf(err error, format string, args ...interface{}) error { + formatted := fmt.Sprintf(format, args...) + msg := fmt.Sprintf("%s: %s", formatted, err.Error()) + return errToGRPCMsg(err, msg) +} + +func errToGRPCMsg(err error, msg string) error { + if err == nil { + return nil + } + if _, ok := status.FromError(err); ok { + return err + } + + switch { + case errors.Is(err, errdefs.ErrInvalidArgument): + return status.Errorf(codes.InvalidArgument, msg) + case errors.Is(err, errdefs.ErrNotFound): + return status.Errorf(codes.NotFound, msg) + case errors.Is(err, errdefs.ErrAlreadyExists): + return status.Errorf(codes.AlreadyExists, msg) + case errors.Is(err, errdefs.ErrFailedPrecondition): + return status.Errorf(codes.FailedPrecondition, msg) + case errors.Is(err, errdefs.ErrUnavailable): + return status.Errorf(codes.Unavailable, msg) + case errors.Is(err, errdefs.ErrNotImplemented): + return status.Errorf(codes.Unimplemented, msg) + case errors.Is(err, context.Canceled): + return status.Errorf(codes.Canceled, msg) + case errors.Is(err, context.DeadlineExceeded): + return status.Errorf(codes.DeadlineExceeded, msg) + } + + return errdefs.ToGRPC(err) +} diff --git a/pkg/shim/utils/errors_test.go b/pkg/shim/utils/errors_test.go new file mode 100644 index 000000000..0a8fe34c8 --- /dev/null +++ b/pkg/shim/utils/errors_test.go @@ -0,0 +1,50 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "fmt" + "testing" + + "github.com/containerd/containerd/errdefs" +) + +func TestGRPCRoundTripsErrors(t *testing.T) { + for _, tc := range []struct { + name string + err error + test func(err error) bool + }{ + { + name: "passthrough", + err: errdefs.ErrNotFound, + test: errdefs.IsNotFound, + }, + { + name: "wrapped", + err: fmt.Errorf("oh no: %w", errdefs.ErrNotFound), + test: errdefs.IsNotFound, + }, + } { + t.Run(tc.name, func(t *testing.T) { + if err := errdefs.FromGRPC(ErrToGRPC(tc.err)); !tc.test(err) { + t.Errorf("errToGRPC got %+v", err) + } + if err := errdefs.FromGRPC(ErrToGRPCf(tc.err, "testing %s", "123")); !tc.test(err) { + t.Errorf("errToGRPCf got %+v", err) + } + }) + } +} diff --git a/pkg/state/state_norace.go b/pkg/state/state_norace.go index 4281aed6d..be09d6141 100644 --- a/pkg/state/state_norace.go +++ b/pkg/state/state_norace.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !race // +build !race package state diff --git a/pkg/state/state_race.go b/pkg/state/state_race.go index 8232981ce..c9f4fd5cf 100644 --- a/pkg/state/state_race.go +++ b/pkg/state/state_race.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build race // +build race package state diff --git a/pkg/state/tests/register_test.go b/pkg/state/tests/register_test.go index 75bdbfc6e..2199d6b01 100644 --- a/pkg/state/tests/register_test.go +++ b/pkg/state/tests/register_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build race // +build race package tests diff --git a/pkg/sync/atomicptrmap/BUILD b/pkg/sync/atomicptrmap/BUILD index b0e218c79..1c0085d69 100644 --- a/pkg/sync/atomicptrmap/BUILD +++ b/pkg/sync/atomicptrmap/BUILD @@ -19,6 +19,7 @@ go_template( "Key", "Value", ], + visibility = ["//:sandbox"], deps = [ "//pkg/gohacks", "//pkg/sync", diff --git a/pkg/sync/checklocks_off_unsafe.go b/pkg/sync/checklocks_off_unsafe.go index 62c81b149..87c56dd12 100644 --- a/pkg/sync/checklocks_off_unsafe.go +++ b/pkg/sync/checklocks_off_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build !checklocks // +build !checklocks package sync diff --git a/pkg/sync/checklocks_on_unsafe.go b/pkg/sync/checklocks_on_unsafe.go index 24f933ed1..f2bfde083 100644 --- a/pkg/sync/checklocks_on_unsafe.go +++ b/pkg/sync/checklocks_on_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build checklocks // +build checklocks package sync diff --git a/pkg/sync/goyield_go113_unsafe.go b/pkg/sync/goyield_go113_unsafe.go index 8aee0d455..c4b03e9aa 100644 --- a/pkg/sync/goyield_go113_unsafe.go +++ b/pkg/sync/goyield_go113_unsafe.go @@ -3,8 +3,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 -// +build !go1.14 +//go:build go1.13 && !go1.14 +// +build go1.13,!go1.14 package sync diff --git a/pkg/sync/goyield_unsafe.go b/pkg/sync/goyield_unsafe.go index f3cc12163..757edbaba 100644 --- a/pkg/sync/goyield_unsafe.go +++ b/pkg/sync/goyield_unsafe.go @@ -3,10 +3,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build go1.14 // +build go1.14 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. package sync diff --git a/pkg/sync/mutex_test.go b/pkg/sync/mutex_test.go index 4fb51a8ab..9e4e3f0b2 100644 --- a/pkg/sync/mutex_test.go +++ b/pkg/sync/mutex_test.go @@ -64,7 +64,7 @@ func TestTryLockUnlock(t *testing.T) { if !m.TryLock() { t.Fatal("failed to aquire lock") } - m.Unlock() + m.Unlock() // +checklocksforce if !m.TryLock() { t.Fatal("failed to aquire lock after unlock") } diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go index 411a80a8a..e4701b464 100644 --- a/pkg/sync/mutex_unsafe.go +++ b/pkg/sync/mutex_unsafe.go @@ -3,8 +3,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 -// +build !go1.18 +//go:build go1.13 && !go1.19 +// +build go1.13,!go1.19 // When updating the build constraint (above), check that syncMutex matches the // standard library sync.Mutex definition. @@ -32,6 +32,18 @@ func (m *CrossGoroutineMutex) state() *int32 { return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state } +// Lock locks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Lock() { + m.Mutex.Lock() +} + +// Unlock unlocks the underlying Mutex. +// +checklocksignore +func (m *CrossGoroutineMutex) Unlock() { + m.Mutex.Unlock() +} + const ( mutexUnlocked = 0 mutexLocked = 1 @@ -62,6 +74,7 @@ type Mutex struct { // Lock locks m. If the lock is already in use, the calling goroutine blocks // until the mutex is available. +// +checklocksignore func (m *Mutex) Lock() { noteLock(unsafe.Pointer(m)) m.m.Lock() @@ -80,6 +93,7 @@ func (m *Mutex) Unlock() { // TryLock tries to acquire the mutex. It returns true if it succeeds and false // otherwise. TryLock does not block. +// +checklocksignore func (m *Mutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(m)) diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go index 70b5f3a5e..8eca99134 100644 --- a/pkg/sync/norace_unsafe.go +++ b/pkg/sync/norace_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build !race // +build !race package sync diff --git a/pkg/sync/race_amd64.s b/pkg/sync/race_amd64.s index 57bc0ec79..199602387 100644 --- a/pkg/sync/race_amd64.s +++ b/pkg/sync/race_amd64.s @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build race -// +build amd64 +//go:build race && amd64 +// +build race,amd64 #include "textflag.h" diff --git a/pkg/sync/race_arm64.s b/pkg/sync/race_arm64.s index 88f091fda..c4192e870 100644 --- a/pkg/sync/race_arm64.s +++ b/pkg/sync/race_arm64.s @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build race -// +build arm64 +//go:build race && arm64 +// +build race,arm64 #include "textflag.h" diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go index 59985c270..381163cac 100644 --- a/pkg/sync/race_unsafe.go +++ b/pkg/sync/race_unsafe.go @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build race // +build race package sync diff --git a/pkg/sync/runtime_unsafe.go b/pkg/sync/runtime_unsafe.go index 39c766331..f6e6a4f7b 100644 --- a/pkg/sync/runtime_unsafe.go +++ b/pkg/sync/runtime_unsafe.go @@ -3,11 +3,16 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build go1.13 -// +build !go1.18 +//go:build go1.13 && !go1.19 +// +build go1.13,!go1.19 -// Check go:linkname function signatures, type definitions, and constants when -// updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. + +// Check type definitions and constants when updating Go version. +// +// TODO(b/165820485): add these checks to checklinkname. package sync @@ -109,10 +114,10 @@ type maptype struct { // These functions are only used within the sync package. //go:linkname semacquire sync.runtime_Semacquire -func semacquire(s *uint32) +func semacquire(addr *uint32) //go:linkname semrelease sync.runtime_Semrelease -func semrelease(s *uint32, handoff bool, skipframes int) +func semrelease(addr *uint32, handoff bool, skipframes int) //go:linkname canSpin sync.runtime_canSpin func canSpin(i int) bool diff --git a/pkg/sync/rwmutex_test.go b/pkg/sync/rwmutex_test.go index 5ca96d12b..56a88e712 100644 --- a/pkg/sync/rwmutex_test.go +++ b/pkg/sync/rwmutex_test.go @@ -172,7 +172,7 @@ func TestRWTryLockUnlock(t *testing.T) { if !rwm.TryLock() { t.Fatal("failed to aquire lock") } - rwm.Unlock() + rwm.Unlock() // +checklocksforce if !rwm.TryLock() { t.Fatal("failed to aquire lock after unlock") } diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go index 892d3e641..7829b06db 100644 --- a/pkg/sync/rwmutex_unsafe.go +++ b/pkg/sync/rwmutex_unsafe.go @@ -37,6 +37,7 @@ const rwmutexMaxReaders = 1 << 30 // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *CrossGoroutineRWMutex) TryRLock() bool { if RaceEnabled { RaceDisable() @@ -65,6 +66,7 @@ func (rw *CrossGoroutineRWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. +// +checklocksignore func (rw *CrossGoroutineRWMutex) RLock() { if RaceEnabled { RaceDisable() @@ -83,6 +85,7 @@ func (rw *CrossGoroutineRWMutex) RLock() { // // Preconditions: // * rw is locked for reading. +// +checklocksignore func (rw *CrossGoroutineRWMutex) RUnlock() { if RaceEnabled { RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) @@ -134,6 +137,7 @@ func (rw *CrossGoroutineRWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. +// +checklocksignore func (rw *CrossGoroutineRWMutex) Lock() { if RaceEnabled { RaceDisable() @@ -228,6 +232,7 @@ type RWMutex struct { // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryRLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) @@ -243,6 +248,7 @@ func (rw *RWMutex) TryRLock() bool { // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. +// +checklocksignore func (rw *RWMutex) RLock() { noteLock(unsafe.Pointer(rw)) rw.m.RLock() @@ -261,6 +267,7 @@ func (rw *RWMutex) RUnlock() { // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. +// +checklocksignore func (rw *RWMutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) @@ -273,6 +280,7 @@ func (rw *RWMutex) TryLock() bool { // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. +// +checklocksignore func (rw *RWMutex) Lock() { noteLock(unsafe.Pointer(rw)) rw.m.Lock() diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 9cc9e3bf2..d8c4c9613 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -11,8 +11,10 @@ go_library( ], visibility = ["//visibility:public"], deps = [ - "//pkg/abi/linux", - "//pkg/syserror", + "//pkg/abi/linux/errno", + "//pkg/errors", + "//pkg/errors/linuxerr", + "//pkg/safecopy", "//pkg/tcpip", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index c8c10f48b..fb92738af 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package syserr diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 90be24e15..eb44f1254 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -17,7 +17,7 @@ package syserr import ( "fmt" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/tcpip" ) @@ -25,33 +25,33 @@ import ( // Mapping for tcpip.Error types. var ( - ErrUnknownProtocol = New((&tcpip.ErrUnknownProtocol{}).String(), linux.EINVAL) - ErrUnknownNICID = New((&tcpip.ErrUnknownNICID{}).String(), linux.ENODEV) - ErrUnknownDevice = New((&tcpip.ErrUnknownDevice{}).String(), linux.ENODEV) - ErrUnknownProtocolOption = New((&tcpip.ErrUnknownProtocolOption{}).String(), linux.ENOPROTOOPT) - ErrDuplicateNICID = New((&tcpip.ErrDuplicateNICID{}).String(), linux.EEXIST) - ErrDuplicateAddress = New((&tcpip.ErrDuplicateAddress{}).String(), linux.EEXIST) - ErrAlreadyBound = New((&tcpip.ErrAlreadyBound{}).String(), linux.EINVAL) - ErrInvalidEndpointState = New((&tcpip.ErrInvalidEndpointState{}).String(), linux.EINVAL) - ErrAlreadyConnecting = New((&tcpip.ErrAlreadyConnecting{}).String(), linux.EALREADY) - ErrNoPortAvailable = New((&tcpip.ErrNoPortAvailable{}).String(), linux.EAGAIN) - ErrPortInUse = New((&tcpip.ErrPortInUse{}).String(), linux.EADDRINUSE) - ErrBadLocalAddress = New((&tcpip.ErrBadLocalAddress{}).String(), linux.EADDRNOTAVAIL) - ErrClosedForSend = New((&tcpip.ErrClosedForSend{}).String(), linux.EPIPE) - ErrClosedForReceive = New((&tcpip.ErrClosedForReceive{}).String(), linux.NOERRNO) - ErrTimeout = New((&tcpip.ErrTimeout{}).String(), linux.ETIMEDOUT) - ErrAborted = New((&tcpip.ErrAborted{}).String(), linux.EPIPE) - ErrConnectStarted = New((&tcpip.ErrConnectStarted{}).String(), linux.EINPROGRESS) - ErrDestinationRequired = New((&tcpip.ErrDestinationRequired{}).String(), linux.EDESTADDRREQ) - ErrNotSupported = New((&tcpip.ErrNotSupported{}).String(), linux.EOPNOTSUPP) - ErrQueueSizeNotSupported = New((&tcpip.ErrQueueSizeNotSupported{}).String(), linux.ENOTTY) - ErrNoSuchFile = New((&tcpip.ErrNoSuchFile{}).String(), linux.ENOENT) - ErrInvalidOptionValue = New((&tcpip.ErrInvalidOptionValue{}).String(), linux.EINVAL) - ErrBroadcastDisabled = New((&tcpip.ErrBroadcastDisabled{}).String(), linux.EACCES) - ErrNotPermittedNet = New((&tcpip.ErrNotPermitted{}).String(), linux.EPERM) - ErrBadBuffer = New((&tcpip.ErrBadBuffer{}).String(), linux.EFAULT) - ErrMalformedHeader = New((&tcpip.ErrMalformedHeader{}).String(), linux.EINVAL) - ErrInvalidPortRange = New((&tcpip.ErrInvalidPortRange{}).String(), linux.EINVAL) + ErrUnknownProtocol = New((&tcpip.ErrUnknownProtocol{}).String(), errno.EINVAL) + ErrUnknownNICID = New((&tcpip.ErrUnknownNICID{}).String(), errno.ENODEV) + ErrUnknownDevice = New((&tcpip.ErrUnknownDevice{}).String(), errno.ENODEV) + ErrUnknownProtocolOption = New((&tcpip.ErrUnknownProtocolOption{}).String(), errno.ENOPROTOOPT) + ErrDuplicateNICID = New((&tcpip.ErrDuplicateNICID{}).String(), errno.EEXIST) + ErrDuplicateAddress = New((&tcpip.ErrDuplicateAddress{}).String(), errno.EEXIST) + ErrAlreadyBound = New((&tcpip.ErrAlreadyBound{}).String(), errno.EINVAL) + ErrInvalidEndpointState = New((&tcpip.ErrInvalidEndpointState{}).String(), errno.EINVAL) + ErrAlreadyConnecting = New((&tcpip.ErrAlreadyConnecting{}).String(), errno.EALREADY) + ErrNoPortAvailable = New((&tcpip.ErrNoPortAvailable{}).String(), errno.EAGAIN) + ErrPortInUse = New((&tcpip.ErrPortInUse{}).String(), errno.EADDRINUSE) + ErrBadLocalAddress = New((&tcpip.ErrBadLocalAddress{}).String(), errno.EADDRNOTAVAIL) + ErrClosedForSend = New((&tcpip.ErrClosedForSend{}).String(), errno.EPIPE) + ErrClosedForReceive = New((&tcpip.ErrClosedForReceive{}).String(), errno.NOERRNO) + ErrTimeout = New((&tcpip.ErrTimeout{}).String(), errno.ETIMEDOUT) + ErrAborted = New((&tcpip.ErrAborted{}).String(), errno.EPIPE) + ErrConnectStarted = New((&tcpip.ErrConnectStarted{}).String(), errno.EINPROGRESS) + ErrDestinationRequired = New((&tcpip.ErrDestinationRequired{}).String(), errno.EDESTADDRREQ) + ErrNotSupported = New((&tcpip.ErrNotSupported{}).String(), errno.EOPNOTSUPP) + ErrQueueSizeNotSupported = New((&tcpip.ErrQueueSizeNotSupported{}).String(), errno.ENOTTY) + ErrNoSuchFile = New((&tcpip.ErrNoSuchFile{}).String(), errno.ENOENT) + ErrInvalidOptionValue = New((&tcpip.ErrInvalidOptionValue{}).String(), errno.EINVAL) + ErrBroadcastDisabled = New((&tcpip.ErrBroadcastDisabled{}).String(), errno.EACCES) + ErrNotPermittedNet = New((&tcpip.ErrNotPermitted{}).String(), errno.EPERM) + ErrBadBuffer = New((&tcpip.ErrBadBuffer{}).String(), errno.EFAULT) + ErrMalformedHeader = New((&tcpip.ErrMalformedHeader{}).String(), errno.EINVAL) + ErrInvalidPortRange = New((&tcpip.ErrInvalidPortRange{}).String(), errno.EINVAL) ) // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index d70521f32..b679f3046 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -21,8 +21,10 @@ import ( "fmt" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/abi/linux/errno" + "gvisor.dev/gvisor/pkg/errors" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/safecopy" ) // Error represents an internal error. @@ -31,34 +33,33 @@ type Error struct { message string // noTranslation indicates that this Error cannot be translated to a - // linux.Errno. + // errno.Errno. noTranslation bool - // errno is the linux.Errno this Error should be translated to. - errno linux.Errno + // errno is the errno.Errno this Error should be translated to. + errno errno.Errno } // New creates a new Error and adds a translation for it. // // New must only be called at init. -func New(message string, linuxTranslation linux.Errno) *Error { +func New(message string, linuxTranslation errno.Errno) *Error { err := &Error{message: message, errno: linuxTranslation} // TODO(b/34162363): Remove this. - errno := linuxTranslation - if errno < 0 || int(errno) >= len(linuxBackwardsTranslations) { - panic(fmt.Sprint("invalid errno: ", errno)) + if int(err.errno) >= len(linuxBackwardsTranslations) { + panic(fmt.Sprint("invalid errno: ", err.errno)) } - e := error(unix.Errno(errno)) - // syserror.ErrWouldBlock gets translated to syserror.EWOULDBLOCK and + e := error(unix.Errno(err.errno)) + // linuxerr.ErrWouldBlock gets translated to linuxerr.EWOULDBLOCK and // enables proper blocking semantics. This should temporary address the // class of blocking bugs that keep popping up with the current state of // the error space. - if e == syserror.EWOULDBLOCK { - e = syserror.ErrWouldBlock + if err.errno == linuxerr.EWOULDBLOCK.Errno() { + e = linuxerr.ErrWouldBlock } - linuxBackwardsTranslations[errno] = linuxBackwardsTranslation{err: e, ok: true} + linuxBackwardsTranslations[err.errno] = linuxBackwardsTranslation{err: e, ok: true} return err } @@ -69,7 +70,7 @@ func New(message string, linuxTranslation linux.Errno) *Error { // NewDynamic should only be used sparingly and not be used for static error // messages. Errors with static error messages should be declared with New as // global variables. -func NewDynamic(message string, linuxTranslation linux.Errno) *Error { +func NewDynamic(message string, linuxTranslation errno.Errno) *Error { return &Error{message: message, errno: linuxTranslation} } @@ -82,7 +83,7 @@ func NewWithoutTranslation(message string) *Error { return &Error{message: message, noTranslation: true} } -func newWithHost(message string, linuxTranslation linux.Errno, hostErrno unix.Errno) *Error { +func newWithHost(message string, linuxTranslation errno.Errno, hostErrno unix.Errno) *Error { e := New(message, linuxTranslation) addLinuxHostTranslation(hostErrno, e) return e @@ -114,19 +115,19 @@ func (e *Error) ToError() error { if e.noTranslation { panic(fmt.Sprintf("error %q does not support translation", e.message)) } - errno := int(e.errno) - if errno == linux.NOERRNO { + err := int(e.errno) + if err == errno.NOERRNO { return nil } - if errno <= 0 || errno >= len(linuxBackwardsTranslations) || !linuxBackwardsTranslations[errno].ok { - panic(fmt.Sprintf("unknown error %q (%d)", e.message, errno)) + if err >= len(linuxBackwardsTranslations) || !linuxBackwardsTranslations[err].ok { + panic(fmt.Sprintf("unknown error %q (%d)", e.message, err)) } - return linuxBackwardsTranslations[errno].err + return linuxBackwardsTranslations[err].err } // ToLinux converts the Error to a Linux ABI error that can be returned to the // application. -func (e *Error) ToLinux() linux.Errno { +func (e *Error) ToLinux() errno.Errno { if e.noTranslation { panic(fmt.Sprintf("No Linux ABI translation available for %q", e.message)) } @@ -138,137 +139,137 @@ func (e *Error) ToLinux() linux.Errno { // Some of the errors should be replaced with package specific errors and // others should be removed entirely. var ( - ErrNotPermitted = newWithHost("operation not permitted", linux.EPERM, unix.EPERM) - ErrNoFileOrDir = newWithHost("no such file or directory", linux.ENOENT, unix.ENOENT) - ErrNoProcess = newWithHost("no such process", linux.ESRCH, unix.ESRCH) - ErrInterrupted = newWithHost("interrupted system call", linux.EINTR, unix.EINTR) - ErrIO = newWithHost("I/O error", linux.EIO, unix.EIO) - ErrDeviceOrAddress = newWithHost("no such device or address", linux.ENXIO, unix.ENXIO) - ErrTooManyArgs = newWithHost("argument list too long", linux.E2BIG, unix.E2BIG) - ErrEcec = newWithHost("exec format error", linux.ENOEXEC, unix.ENOEXEC) - ErrBadFD = newWithHost("bad file number", linux.EBADF, unix.EBADF) - ErrNoChild = newWithHost("no child processes", linux.ECHILD, unix.ECHILD) - ErrTryAgain = newWithHost("try again", linux.EAGAIN, unix.EAGAIN) - ErrNoMemory = newWithHost("out of memory", linux.ENOMEM, unix.ENOMEM) - ErrPermissionDenied = newWithHost("permission denied", linux.EACCES, unix.EACCES) - ErrBadAddress = newWithHost("bad address", linux.EFAULT, unix.EFAULT) - ErrNotBlockDevice = newWithHost("block device required", linux.ENOTBLK, unix.ENOTBLK) - ErrBusy = newWithHost("device or resource busy", linux.EBUSY, unix.EBUSY) - ErrExists = newWithHost("file exists", linux.EEXIST, unix.EEXIST) - ErrCrossDeviceLink = newWithHost("cross-device link", linux.EXDEV, unix.EXDEV) - ErrNoDevice = newWithHost("no such device", linux.ENODEV, unix.ENODEV) - ErrNotDir = newWithHost("not a directory", linux.ENOTDIR, unix.ENOTDIR) - ErrIsDir = newWithHost("is a directory", linux.EISDIR, unix.EISDIR) - ErrInvalidArgument = newWithHost("invalid argument", linux.EINVAL, unix.EINVAL) - ErrFileTableOverflow = newWithHost("file table overflow", linux.ENFILE, unix.ENFILE) - ErrTooManyOpenFiles = newWithHost("too many open files", linux.EMFILE, unix.EMFILE) - ErrNotTTY = newWithHost("not a typewriter", linux.ENOTTY, unix.ENOTTY) - ErrTestFileBusy = newWithHost("text file busy", linux.ETXTBSY, unix.ETXTBSY) - ErrFileTooBig = newWithHost("file too large", linux.EFBIG, unix.EFBIG) - ErrNoSpace = newWithHost("no space left on device", linux.ENOSPC, unix.ENOSPC) - ErrIllegalSeek = newWithHost("illegal seek", linux.ESPIPE, unix.ESPIPE) - ErrReadOnlyFS = newWithHost("read-only file system", linux.EROFS, unix.EROFS) - ErrTooManyLinks = newWithHost("too many links", linux.EMLINK, unix.EMLINK) - ErrBrokenPipe = newWithHost("broken pipe", linux.EPIPE, unix.EPIPE) - ErrDomain = newWithHost("math argument out of domain of func", linux.EDOM, unix.EDOM) - ErrRange = newWithHost("math result not representable", linux.ERANGE, unix.ERANGE) - ErrDeadlock = newWithHost("resource deadlock would occur", linux.EDEADLOCK, unix.EDEADLOCK) - ErrNameTooLong = newWithHost("file name too long", linux.ENAMETOOLONG, unix.ENAMETOOLONG) - ErrNoLocksAvailable = newWithHost("no record locks available", linux.ENOLCK, unix.ENOLCK) - ErrInvalidSyscall = newWithHost("invalid system call number", linux.ENOSYS, unix.ENOSYS) - ErrDirNotEmpty = newWithHost("directory not empty", linux.ENOTEMPTY, unix.ENOTEMPTY) - ErrLinkLoop = newWithHost("too many symbolic links encountered", linux.ELOOP, unix.ELOOP) - ErrNoMessage = newWithHost("no message of desired type", linux.ENOMSG, unix.ENOMSG) - ErrIdentifierRemoved = newWithHost("identifier removed", linux.EIDRM, unix.EIDRM) - ErrChannelOutOfRange = newWithHost("channel number out of range", linux.ECHRNG, unix.ECHRNG) - ErrLevelTwoNotSynced = newWithHost("level 2 not synchronized", linux.EL2NSYNC, unix.EL2NSYNC) - ErrLevelThreeHalted = newWithHost("level 3 halted", linux.EL3HLT, unix.EL3HLT) - ErrLevelThreeReset = newWithHost("level 3 reset", linux.EL3RST, unix.EL3RST) - ErrLinkNumberOutOfRange = newWithHost("link number out of range", linux.ELNRNG, unix.ELNRNG) - ErrProtocolDriverNotAttached = newWithHost("protocol driver not attached", linux.EUNATCH, unix.EUNATCH) - ErrNoCSIAvailable = newWithHost("no CSI structure available", linux.ENOCSI, unix.ENOCSI) - ErrLevelTwoHalted = newWithHost("level 2 halted", linux.EL2HLT, unix.EL2HLT) - ErrInvalidExchange = newWithHost("invalid exchange", linux.EBADE, unix.EBADE) - ErrInvalidRequestDescriptor = newWithHost("invalid request descriptor", linux.EBADR, unix.EBADR) - ErrExchangeFull = newWithHost("exchange full", linux.EXFULL, unix.EXFULL) - ErrNoAnode = newWithHost("no anode", linux.ENOANO, unix.ENOANO) - ErrInvalidRequestCode = newWithHost("invalid request code", linux.EBADRQC, unix.EBADRQC) - ErrInvalidSlot = newWithHost("invalid slot", linux.EBADSLT, unix.EBADSLT) - ErrBadFontFile = newWithHost("bad font file format", linux.EBFONT, unix.EBFONT) - ErrNotStream = newWithHost("device not a stream", linux.ENOSTR, unix.ENOSTR) - ErrNoDataAvailable = newWithHost("no data available", linux.ENODATA, unix.ENODATA) - ErrTimerExpired = newWithHost("timer expired", linux.ETIME, unix.ETIME) - ErrStreamsResourceDepleted = newWithHost("out of streams resources", linux.ENOSR, unix.ENOSR) - ErrMachineNotOnNetwork = newWithHost("machine is not on the network", linux.ENONET, unix.ENONET) - ErrPackageNotInstalled = newWithHost("package not installed", linux.ENOPKG, unix.ENOPKG) - ErrIsRemote = newWithHost("object is remote", linux.EREMOTE, unix.EREMOTE) - ErrNoLink = newWithHost("link has been severed", linux.ENOLINK, unix.ENOLINK) - ErrAdvertise = newWithHost("advertise error", linux.EADV, unix.EADV) - ErrSRMount = newWithHost("srmount error", linux.ESRMNT, unix.ESRMNT) - ErrSendCommunication = newWithHost("communication error on send", linux.ECOMM, unix.ECOMM) - ErrProtocol = newWithHost("protocol error", linux.EPROTO, unix.EPROTO) - ErrMultihopAttempted = newWithHost("multihop attempted", linux.EMULTIHOP, unix.EMULTIHOP) - ErrRFS = newWithHost("RFS specific error", linux.EDOTDOT, unix.EDOTDOT) - ErrInvalidDataMessage = newWithHost("not a data message", linux.EBADMSG, unix.EBADMSG) - ErrOverflow = newWithHost("value too large for defined data type", linux.EOVERFLOW, unix.EOVERFLOW) - ErrNetworkNameNotUnique = newWithHost("name not unique on network", linux.ENOTUNIQ, unix.ENOTUNIQ) - ErrFDInBadState = newWithHost("file descriptor in bad state", linux.EBADFD, unix.EBADFD) - ErrRemoteAddressChanged = newWithHost("remote address changed", linux.EREMCHG, unix.EREMCHG) - ErrSharedLibraryInaccessible = newWithHost("can not access a needed shared library", linux.ELIBACC, unix.ELIBACC) - ErrCorruptedSharedLibrary = newWithHost("accessing a corrupted shared library", linux.ELIBBAD, unix.ELIBBAD) - ErrLibSectionCorrupted = newWithHost(".lib section in a.out corrupted", linux.ELIBSCN, unix.ELIBSCN) - ErrTooManySharedLibraries = newWithHost("attempting to link in too many shared libraries", linux.ELIBMAX, unix.ELIBMAX) - ErrSharedLibraryExeced = newWithHost("cannot exec a shared library directly", linux.ELIBEXEC, unix.ELIBEXEC) - ErrIllegalByteSequence = newWithHost("illegal byte sequence", linux.EILSEQ, unix.EILSEQ) - ErrShouldRestart = newWithHost("interrupted system call should be restarted", linux.ERESTART, unix.ERESTART) - ErrStreamPipe = newWithHost("streams pipe error", linux.ESTRPIPE, unix.ESTRPIPE) - ErrTooManyUsers = newWithHost("too many users", linux.EUSERS, unix.EUSERS) - ErrNotASocket = newWithHost("socket operation on non-socket", linux.ENOTSOCK, unix.ENOTSOCK) - ErrDestinationAddressRequired = newWithHost("destination address required", linux.EDESTADDRREQ, unix.EDESTADDRREQ) - ErrMessageTooLong = newWithHost("message too long", linux.EMSGSIZE, unix.EMSGSIZE) - ErrWrongProtocolForSocket = newWithHost("protocol wrong type for socket", linux.EPROTOTYPE, unix.EPROTOTYPE) - ErrProtocolNotAvailable = newWithHost("protocol not available", linux.ENOPROTOOPT, unix.ENOPROTOOPT) - ErrProtocolNotSupported = newWithHost("protocol not supported", linux.EPROTONOSUPPORT, unix.EPROTONOSUPPORT) - ErrSocketNotSupported = newWithHost("socket type not supported", linux.ESOCKTNOSUPPORT, unix.ESOCKTNOSUPPORT) - ErrEndpointOperation = newWithHost("operation not supported on transport endpoint", linux.EOPNOTSUPP, unix.EOPNOTSUPP) - ErrProtocolFamilyNotSupported = newWithHost("protocol family not supported", linux.EPFNOSUPPORT, unix.EPFNOSUPPORT) - ErrAddressFamilyNotSupported = newWithHost("address family not supported by protocol", linux.EAFNOSUPPORT, unix.EAFNOSUPPORT) - ErrAddressInUse = newWithHost("address already in use", linux.EADDRINUSE, unix.EADDRINUSE) - ErrAddressNotAvailable = newWithHost("cannot assign requested address", linux.EADDRNOTAVAIL, unix.EADDRNOTAVAIL) - ErrNetworkDown = newWithHost("network is down", linux.ENETDOWN, unix.ENETDOWN) - ErrNetworkUnreachable = newWithHost("network is unreachable", linux.ENETUNREACH, unix.ENETUNREACH) - ErrNetworkReset = newWithHost("network dropped connection because of reset", linux.ENETRESET, unix.ENETRESET) - ErrConnectionAborted = newWithHost("software caused connection abort", linux.ECONNABORTED, unix.ECONNABORTED) - ErrConnectionReset = newWithHost("connection reset by peer", linux.ECONNRESET, unix.ECONNRESET) - ErrNoBufferSpace = newWithHost("no buffer space available", linux.ENOBUFS, unix.ENOBUFS) - ErrAlreadyConnected = newWithHost("transport endpoint is already connected", linux.EISCONN, unix.EISCONN) - ErrNotConnected = newWithHost("transport endpoint is not connected", linux.ENOTCONN, unix.ENOTCONN) - ErrShutdown = newWithHost("cannot send after transport endpoint shutdown", linux.ESHUTDOWN, unix.ESHUTDOWN) - ErrTooManyRefs = newWithHost("too many references: cannot splice", linux.ETOOMANYREFS, unix.ETOOMANYREFS) - ErrTimedOut = newWithHost("connection timed out", linux.ETIMEDOUT, unix.ETIMEDOUT) - ErrConnectionRefused = newWithHost("connection refused", linux.ECONNREFUSED, unix.ECONNREFUSED) - ErrHostDown = newWithHost("host is down", linux.EHOSTDOWN, unix.EHOSTDOWN) - ErrNoRoute = newWithHost("no route to host", linux.EHOSTUNREACH, unix.EHOSTUNREACH) - ErrAlreadyInProgress = newWithHost("operation already in progress", linux.EALREADY, unix.EALREADY) - ErrInProgress = newWithHost("operation now in progress", linux.EINPROGRESS, unix.EINPROGRESS) - ErrStaleFileHandle = newWithHost("stale file handle", linux.ESTALE, unix.ESTALE) - ErrStructureNeedsCleaning = newWithHost("structure needs cleaning", linux.EUCLEAN, unix.EUCLEAN) - ErrIsNamedFile = newWithHost("is a named type file", linux.ENOTNAM, unix.ENOTNAM) - ErrRemoteIO = newWithHost("remote I/O error", linux.EREMOTEIO, unix.EREMOTEIO) - ErrQuotaExceeded = newWithHost("quota exceeded", linux.EDQUOT, unix.EDQUOT) - ErrNoMedium = newWithHost("no medium found", linux.ENOMEDIUM, unix.ENOMEDIUM) - ErrWrongMediumType = newWithHost("wrong medium type", linux.EMEDIUMTYPE, unix.EMEDIUMTYPE) - ErrCanceled = newWithHost("operation canceled", linux.ECANCELED, unix.ECANCELED) - ErrNoKey = newWithHost("required key not available", linux.ENOKEY, unix.ENOKEY) - ErrKeyExpired = newWithHost("key has expired", linux.EKEYEXPIRED, unix.EKEYEXPIRED) - ErrKeyRevoked = newWithHost("key has been revoked", linux.EKEYREVOKED, unix.EKEYREVOKED) - ErrKeyRejected = newWithHost("key was rejected by service", linux.EKEYREJECTED, unix.EKEYREJECTED) - ErrOwnerDied = newWithHost("owner died", linux.EOWNERDEAD, unix.EOWNERDEAD) - ErrNotRecoverable = newWithHost("state not recoverable", linux.ENOTRECOVERABLE, unix.ENOTRECOVERABLE) + ErrNotPermitted = newWithHost("operation not permitted", errno.EPERM, unix.EPERM) + ErrNoFileOrDir = newWithHost("no such file or directory", errno.ENOENT, unix.ENOENT) + ErrNoProcess = newWithHost("no such process", errno.ESRCH, unix.ESRCH) + ErrInterrupted = newWithHost("interrupted system call", errno.EINTR, unix.EINTR) + ErrIO = newWithHost("I/O error", errno.EIO, unix.EIO) + ErrDeviceOrAddress = newWithHost("no such device or address", errno.ENXIO, unix.ENXIO) + ErrTooManyArgs = newWithHost("argument list too long", errno.E2BIG, unix.E2BIG) + ErrEcec = newWithHost("exec format error", errno.ENOEXEC, unix.ENOEXEC) + ErrBadFD = newWithHost("bad file number", errno.EBADF, unix.EBADF) + ErrNoChild = newWithHost("no child processes", errno.ECHILD, unix.ECHILD) + ErrTryAgain = newWithHost("try again", errno.EAGAIN, unix.EAGAIN) + ErrNoMemory = newWithHost("out of memory", errno.ENOMEM, unix.ENOMEM) + ErrPermissionDenied = newWithHost("permission denied", errno.EACCES, unix.EACCES) + ErrBadAddress = newWithHost("bad address", errno.EFAULT, unix.EFAULT) + ErrNotBlockDevice = newWithHost("block device required", errno.ENOTBLK, unix.ENOTBLK) + ErrBusy = newWithHost("device or resource busy", errno.EBUSY, unix.EBUSY) + ErrExists = newWithHost("file exists", errno.EEXIST, unix.EEXIST) + ErrCrossDeviceLink = newWithHost("cross-device link", errno.EXDEV, unix.EXDEV) + ErrNoDevice = newWithHost("no such device", errno.ENODEV, unix.ENODEV) + ErrNotDir = newWithHost("not a directory", errno.ENOTDIR, unix.ENOTDIR) + ErrIsDir = newWithHost("is a directory", errno.EISDIR, unix.EISDIR) + ErrInvalidArgument = newWithHost("invalid argument", errno.EINVAL, unix.EINVAL) + ErrFileTableOverflow = newWithHost("file table overflow", errno.ENFILE, unix.ENFILE) + ErrTooManyOpenFiles = newWithHost("too many open files", errno.EMFILE, unix.EMFILE) + ErrNotTTY = newWithHost("not a typewriter", errno.ENOTTY, unix.ENOTTY) + ErrTestFileBusy = newWithHost("text file busy", errno.ETXTBSY, unix.ETXTBSY) + ErrFileTooBig = newWithHost("file too large", errno.EFBIG, unix.EFBIG) + ErrNoSpace = newWithHost("no space left on device", errno.ENOSPC, unix.ENOSPC) + ErrIllegalSeek = newWithHost("illegal seek", errno.ESPIPE, unix.ESPIPE) + ErrReadOnlyFS = newWithHost("read-only file system", errno.EROFS, unix.EROFS) + ErrTooManyLinks = newWithHost("too many links", errno.EMLINK, unix.EMLINK) + ErrBrokenPipe = newWithHost("broken pipe", errno.EPIPE, unix.EPIPE) + ErrDomain = newWithHost("math argument out of domain of func", errno.EDOM, unix.EDOM) + ErrRange = newWithHost("math result not representable", errno.ERANGE, unix.ERANGE) + ErrDeadlock = newWithHost("resource deadlock would occur", errno.EDEADLOCK, unix.EDEADLOCK) + ErrNameTooLong = newWithHost("file name too long", errno.ENAMETOOLONG, unix.ENAMETOOLONG) + ErrNoLocksAvailable = newWithHost("no record locks available", errno.ENOLCK, unix.ENOLCK) + ErrInvalidSyscall = newWithHost("invalid system call number", errno.ENOSYS, unix.ENOSYS) + ErrDirNotEmpty = newWithHost("directory not empty", errno.ENOTEMPTY, unix.ENOTEMPTY) + ErrLinkLoop = newWithHost("too many symbolic links encountered", errno.ELOOP, unix.ELOOP) + ErrNoMessage = newWithHost("no message of desired type", errno.ENOMSG, unix.ENOMSG) + ErrIdentifierRemoved = newWithHost("identifier removed", errno.EIDRM, unix.EIDRM) + ErrChannelOutOfRange = newWithHost("channel number out of range", errno.ECHRNG, unix.ECHRNG) + ErrLevelTwoNotSynced = newWithHost("level 2 not synchronized", errno.EL2NSYNC, unix.EL2NSYNC) + ErrLevelThreeHalted = newWithHost("level 3 halted", errno.EL3HLT, unix.EL3HLT) + ErrLevelThreeReset = newWithHost("level 3 reset", errno.EL3RST, unix.EL3RST) + ErrLinkNumberOutOfRange = newWithHost("link number out of range", errno.ELNRNG, unix.ELNRNG) + ErrProtocolDriverNotAttached = newWithHost("protocol driver not attached", errno.EUNATCH, unix.EUNATCH) + ErrNoCSIAvailable = newWithHost("no CSI structure available", errno.ENOCSI, unix.ENOCSI) + ErrLevelTwoHalted = newWithHost("level 2 halted", errno.EL2HLT, unix.EL2HLT) + ErrInvalidExchange = newWithHost("invalid exchange", errno.EBADE, unix.EBADE) + ErrInvalidRequestDescriptor = newWithHost("invalid request descriptor", errno.EBADR, unix.EBADR) + ErrExchangeFull = newWithHost("exchange full", errno.EXFULL, unix.EXFULL) + ErrNoAnode = newWithHost("no anode", errno.ENOANO, unix.ENOANO) + ErrInvalidRequestCode = newWithHost("invalid request code", errno.EBADRQC, unix.EBADRQC) + ErrInvalidSlot = newWithHost("invalid slot", errno.EBADSLT, unix.EBADSLT) + ErrBadFontFile = newWithHost("bad font file format", errno.EBFONT, unix.EBFONT) + ErrNotStream = newWithHost("device not a stream", errno.ENOSTR, unix.ENOSTR) + ErrNoDataAvailable = newWithHost("no data available", errno.ENODATA, unix.ENODATA) + ErrTimerExpired = newWithHost("timer expired", errno.ETIME, unix.ETIME) + ErrStreamsResourceDepleted = newWithHost("out of streams resources", errno.ENOSR, unix.ENOSR) + ErrMachineNotOnNetwork = newWithHost("machine is not on the network", errno.ENONET, unix.ENONET) + ErrPackageNotInstalled = newWithHost("package not installed", errno.ENOPKG, unix.ENOPKG) + ErrIsRemote = newWithHost("object is remote", errno.EREMOTE, unix.EREMOTE) + ErrNoLink = newWithHost("link has been severed", errno.ENOLINK, unix.ENOLINK) + ErrAdvertise = newWithHost("advertise error", errno.EADV, unix.EADV) + ErrSRMount = newWithHost("srmount error", errno.ESRMNT, unix.ESRMNT) + ErrSendCommunication = newWithHost("communication error on send", errno.ECOMM, unix.ECOMM) + ErrProtocol = newWithHost("protocol error", errno.EPROTO, unix.EPROTO) + ErrMultihopAttempted = newWithHost("multihop attempted", errno.EMULTIHOP, unix.EMULTIHOP) + ErrRFS = newWithHost("RFS specific error", errno.EDOTDOT, unix.EDOTDOT) + ErrInvalidDataMessage = newWithHost("not a data message", errno.EBADMSG, unix.EBADMSG) + ErrOverflow = newWithHost("value too large for defined data type", errno.EOVERFLOW, unix.EOVERFLOW) + ErrNetworkNameNotUnique = newWithHost("name not unique on network", errno.ENOTUNIQ, unix.ENOTUNIQ) + ErrFDInBadState = newWithHost("file descriptor in bad state", errno.EBADFD, unix.EBADFD) + ErrRemoteAddressChanged = newWithHost("remote address changed", errno.EREMCHG, unix.EREMCHG) + ErrSharedLibraryInaccessible = newWithHost("can not access a needed shared library", errno.ELIBACC, unix.ELIBACC) + ErrCorruptedSharedLibrary = newWithHost("accessing a corrupted shared library", errno.ELIBBAD, unix.ELIBBAD) + ErrLibSectionCorrupted = newWithHost(".lib section in a.out corrupted", errno.ELIBSCN, unix.ELIBSCN) + ErrTooManySharedLibraries = newWithHost("attempting to link in too many shared libraries", errno.ELIBMAX, unix.ELIBMAX) + ErrSharedLibraryExeced = newWithHost("cannot exec a shared library directly", errno.ELIBEXEC, unix.ELIBEXEC) + ErrIllegalByteSequence = newWithHost("illegal byte sequence", errno.EILSEQ, unix.EILSEQ) + ErrShouldRestart = newWithHost("interrupted system call should be restarted", errno.ERESTART, unix.ERESTART) + ErrStreamPipe = newWithHost("streams pipe error", errno.ESTRPIPE, unix.ESTRPIPE) + ErrTooManyUsers = newWithHost("too many users", errno.EUSERS, unix.EUSERS) + ErrNotASocket = newWithHost("socket operation on non-socket", errno.ENOTSOCK, unix.ENOTSOCK) + ErrDestinationAddressRequired = newWithHost("destination address required", errno.EDESTADDRREQ, unix.EDESTADDRREQ) + ErrMessageTooLong = newWithHost("message too long", errno.EMSGSIZE, unix.EMSGSIZE) + ErrWrongProtocolForSocket = newWithHost("protocol wrong type for socket", errno.EPROTOTYPE, unix.EPROTOTYPE) + ErrProtocolNotAvailable = newWithHost("protocol not available", errno.ENOPROTOOPT, unix.ENOPROTOOPT) + ErrProtocolNotSupported = newWithHost("protocol not supported", errno.EPROTONOSUPPORT, unix.EPROTONOSUPPORT) + ErrSocketNotSupported = newWithHost("socket type not supported", errno.ESOCKTNOSUPPORT, unix.ESOCKTNOSUPPORT) + ErrEndpointOperation = newWithHost("operation not supported on transport endpoint", errno.EOPNOTSUPP, unix.EOPNOTSUPP) + ErrProtocolFamilyNotSupported = newWithHost("protocol family not supported", errno.EPFNOSUPPORT, unix.EPFNOSUPPORT) + ErrAddressFamilyNotSupported = newWithHost("address family not supported by protocol", errno.EAFNOSUPPORT, unix.EAFNOSUPPORT) + ErrAddressInUse = newWithHost("address already in use", errno.EADDRINUSE, unix.EADDRINUSE) + ErrAddressNotAvailable = newWithHost("cannot assign requested address", errno.EADDRNOTAVAIL, unix.EADDRNOTAVAIL) + ErrNetworkDown = newWithHost("network is down", errno.ENETDOWN, unix.ENETDOWN) + ErrNetworkUnreachable = newWithHost("network is unreachable", errno.ENETUNREACH, unix.ENETUNREACH) + ErrNetworkReset = newWithHost("network dropped connection because of reset", errno.ENETRESET, unix.ENETRESET) + ErrConnectionAborted = newWithHost("software caused connection abort", errno.ECONNABORTED, unix.ECONNABORTED) + ErrConnectionReset = newWithHost("connection reset by peer", errno.ECONNRESET, unix.ECONNRESET) + ErrNoBufferSpace = newWithHost("no buffer space available", errno.ENOBUFS, unix.ENOBUFS) + ErrAlreadyConnected = newWithHost("transport endpoint is already connected", errno.EISCONN, unix.EISCONN) + ErrNotConnected = newWithHost("transport endpoint is not connected", errno.ENOTCONN, unix.ENOTCONN) + ErrShutdown = newWithHost("cannot send after transport endpoint shutdown", errno.ESHUTDOWN, unix.ESHUTDOWN) + ErrTooManyRefs = newWithHost("too many references: cannot splice", errno.ETOOMANYREFS, unix.ETOOMANYREFS) + ErrTimedOut = newWithHost("connection timed out", errno.ETIMEDOUT, unix.ETIMEDOUT) + ErrConnectionRefused = newWithHost("connection refused", errno.ECONNREFUSED, unix.ECONNREFUSED) + ErrHostDown = newWithHost("host is down", errno.EHOSTDOWN, unix.EHOSTDOWN) + ErrNoRoute = newWithHost("no route to host", errno.EHOSTUNREACH, unix.EHOSTUNREACH) + ErrAlreadyInProgress = newWithHost("operation already in progress", errno.EALREADY, unix.EALREADY) + ErrInProgress = newWithHost("operation now in progress", errno.EINPROGRESS, unix.EINPROGRESS) + ErrStaleFileHandle = newWithHost("stale file handle", errno.ESTALE, unix.ESTALE) + ErrStructureNeedsCleaning = newWithHost("structure needs cleaning", errno.EUCLEAN, unix.EUCLEAN) + ErrIsNamedFile = newWithHost("is a named type file", errno.ENOTNAM, unix.ENOTNAM) + ErrRemoteIO = newWithHost("remote I/O error", errno.EREMOTEIO, unix.EREMOTEIO) + ErrQuotaExceeded = newWithHost("quota exceeded", errno.EDQUOT, unix.EDQUOT) + ErrNoMedium = newWithHost("no medium found", errno.ENOMEDIUM, unix.ENOMEDIUM) + ErrWrongMediumType = newWithHost("wrong medium type", errno.EMEDIUMTYPE, unix.EMEDIUMTYPE) + ErrCanceled = newWithHost("operation canceled", errno.ECANCELED, unix.ECANCELED) + ErrNoKey = newWithHost("required key not available", errno.ENOKEY, unix.ENOKEY) + ErrKeyExpired = newWithHost("key has expired", errno.EKEYEXPIRED, unix.EKEYEXPIRED) + ErrKeyRevoked = newWithHost("key has been revoked", errno.EKEYREVOKED, unix.EKEYREVOKED) + ErrKeyRejected = newWithHost("key was rejected by service", errno.EKEYREJECTED, unix.EKEYREJECTED) + ErrOwnerDied = newWithHost("owner died", errno.EOWNERDEAD, unix.EOWNERDEAD) + ErrNotRecoverable = newWithHost("state not recoverable", errno.ENOTRECOVERABLE, unix.ENOTRECOVERABLE) // ErrWouldBlock translates to EWOULDBLOCK which is the same as EAGAIN // on Linux. - ErrWouldBlock = New("operation would block", linux.EWOULDBLOCK) + ErrWouldBlock = New("operation would block", errno.EWOULDBLOCK) ) // FromError converts a generic error to an *Error. @@ -278,11 +279,25 @@ func FromError(err error) *Error { if err == nil { return nil } - if errno, ok := err.(unix.Errno); ok { - return FromHost(errno) + + switch e := err.(type) { + case unix.Errno: + return FromHost(e) + case *errors.Error: + return FromHost(unix.Errno(e.Errno())) + case safecopy.SegvError, safecopy.BusError, safecopy.AlignmentError: + return FromHost(unix.EFAULT) } - if errno, ok := syserror.TranslateError(err); ok { - return FromHost(errno) + + msg := fmt.Sprintf("err: %s type: %T", err.Error(), err) + panic(msg) +} + +// ConvertIntr converts the provided error code (err) to another one (intr) if +// the first error corresponds to an interrupted operation. +func ConvertIntr(err, intr error) error { + if err == linuxerr.ErrInterrupted { + return intr } - panic("unknown error: " + err.Error()) + return err } diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go deleted file mode 100644 index 56b621357..000000000 --- a/pkg/syserror/syserror.go +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package syserror contains syscall error codes exported as error interface -// instead of Errno. This allows for fast comparison and returns when the -// comparand or return value is of type error because there is no need to -// convert from Errno to an interface, i.e., runtime.convT2I isn't called. -package syserror - -import ( - "errors" - - "golang.org/x/sys/unix" -) - -// The following variables have the same meaning as their syscall equivalent. -var ( - E2BIG = error(unix.E2BIG) - EACCES = error(unix.EACCES) - EADDRINUSE = error(unix.EADDRINUSE) - EAGAIN = error(unix.EAGAIN) - EBADF = error(unix.EBADF) - EBADFD = error(unix.EBADFD) - EBUSY = error(unix.EBUSY) - ECHILD = error(unix.ECHILD) - ECONNABORTED = error(unix.ECONNABORTED) - ECONNREFUSED = error(unix.ECONNREFUSED) - ECONNRESET = error(unix.ECONNRESET) - EDEADLK = error(unix.EDEADLK) - EEXIST = error(unix.EEXIST) - EFAULT = error(unix.EFAULT) - EFBIG = error(unix.EFBIG) - EIDRM = error(unix.EIDRM) - EINTR = error(unix.EINTR) - EINVAL = error(unix.EINVAL) - EIO = error(unix.EIO) - EISDIR = error(unix.EISDIR) - ELIBBAD = error(unix.ELIBBAD) - ELOOP = error(unix.ELOOP) - EMFILE = error(unix.EMFILE) - EMLINK = error(unix.EMLINK) - EMSGSIZE = error(unix.EMSGSIZE) - ENAMETOOLONG = error(unix.ENAMETOOLONG) - ENOATTR = ENODATA - ENOBUFS = error(unix.ENOBUFS) - ENODATA = error(unix.ENODATA) - ENODEV = error(unix.ENODEV) - ENOENT = error(unix.ENOENT) - ENOEXEC = error(unix.ENOEXEC) - ENOLCK = error(unix.ENOLCK) - ENOLINK = error(unix.ENOLINK) - ENOMEM = error(unix.ENOMEM) - ENOSPC = error(unix.ENOSPC) - ENOSYS = error(unix.ENOSYS) - ENOTCONN = error(unix.ENOTCONN) - ENOTDIR = error(unix.ENOTDIR) - ENOTEMPTY = error(unix.ENOTEMPTY) - ENOTSOCK = error(unix.ENOTSOCK) - ENOTSUP = error(unix.ENOTSUP) - ENOTTY = error(unix.ENOTTY) - ENXIO = error(unix.ENXIO) - EOPNOTSUPP = error(unix.EOPNOTSUPP) - EOVERFLOW = error(unix.EOVERFLOW) - EPERM = error(unix.EPERM) - EPIPE = error(unix.EPIPE) - ERANGE = error(unix.ERANGE) - EREMOTE = error(unix.EREMOTE) - EROFS = error(unix.EROFS) - ESPIPE = error(unix.ESPIPE) - ESRCH = error(unix.ESRCH) - ETIMEDOUT = error(unix.ETIMEDOUT) - EUSERS = error(unix.EUSERS) - EWOULDBLOCK = error(unix.EWOULDBLOCK) - EXDEV = error(unix.EXDEV) -) - -var ( - // ErrWouldBlock is an internal error used to indicate that an operation - // cannot be satisfied immediately, and should be retried at a later - // time, possibly when the caller has received a notification that the - // operation may be able to complete. It is used by implementations of - // the kio.File interface. - ErrWouldBlock = errors.New("request would block") - - // ErrInterrupted is returned if a request is interrupted before it can - // complete. - ErrInterrupted = errors.New("request was interrupted") - - // ErrExceedsFileSizeLimit is returned if a request would exceed the - // file's size limit. - ErrExceedsFileSizeLimit = errors.New("exceeds file size limit") -) - -// errorMap is the map used to convert generic errors into errnos. -var errorMap = map[error]unix.Errno{} - -// errorUnwrappers is an array of unwrap functions to extract typed errors. -var errorUnwrappers = []func(error) (unix.Errno, bool){} - -// AddErrorTranslation allows modules to populate the error map by adding their -// own translations during initialization. Returns if the error translation is -// accepted or not. A pre-existing translation will not be overwritten by the -// new translation. -func AddErrorTranslation(from error, to unix.Errno) bool { - if _, ok := errorMap[from]; ok { - return false - } - - errorMap[from] = to - return true -} - -// AddErrorUnwrapper registers an unwrap method that can extract a concrete error -// from a typed, but not initialized, error. -func AddErrorUnwrapper(unwrap func(e error) (unix.Errno, bool)) { - errorUnwrappers = append(errorUnwrappers, unwrap) -} - -// TranslateError translates errors to errnos, it will return false if -// the error was not registered. -func TranslateError(from error) (unix.Errno, bool) { - if err, ok := errorMap[from]; ok { - return err, true - } - // Try to unwrap the error if we couldn't match an error - // exactly. This might mean that a package has its own - // error type. - for _, unwrap := range errorUnwrappers { - if err, ok := unwrap(from); ok { - return err, true - } - } - return 0, false -} - -// ConvertIntr converts the provided error code (err) to another one (intr) if -// the first error corresponds to an interrupted operation. -func ConvertIntr(err, intr error) error { - if err == ErrInterrupted { - return intr - } - return err -} - -// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel -// include/linux/errno.h. These errnos are never returned to userspace -// directly, but are used to communicate the expected behavior of an -// interrupted syscall from the syscall to signal handling. -type SyscallRestartErrno int - -// These numeric values are significant because ptrace syscall exit tracing can -// observe them. -// -// For all of the following errnos, if the syscall is not interrupted by a -// signal delivered to a user handler, the syscall is restarted. -const ( - // ERESTARTSYS is returned by an interrupted syscall to indicate that it - // should be converted to EINTR if interrupted by a signal delivered to a - // user handler without SA_RESTART set, and restarted otherwise. - ERESTARTSYS = SyscallRestartErrno(512) - - // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it - // should always be restarted. - ERESTARTNOINTR = SyscallRestartErrno(513) - - // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it - // should be converted to EINTR if interrupted by a signal delivered to a - // user handler, and restarted otherwise. - ERESTARTNOHAND = SyscallRestartErrno(514) - - // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate - // that it should be restarted using a custom function. The interrupted - // syscall must register a custom restart function by calling - // Task.SetRestartSyscallFn. - ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) -) - -// Error implements error.Error. -func (e SyscallRestartErrno) Error() string { - // Descriptions are borrowed from strace. - switch e { - case ERESTARTSYS: - return "to be restarted if SA_RESTART is set" - case ERESTARTNOINTR: - return "to be restarted" - case ERESTARTNOHAND: - return "to be restarted if no handler" - case ERESTART_RESTARTBLOCK: - return "interrupted by signal" - default: - return "(unknown interrupt error)" - } -} - -// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by -// rv, the value in a syscall return register. -func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { - switch int(rv) { - case -int(ERESTARTSYS): - return ERESTARTSYS, true - case -int(ERESTARTNOINTR): - return ERESTARTNOINTR, true - case -int(ERESTARTNOHAND): - return ERESTARTNOHAND, true - case -int(ERESTART_RESTARTBLOCK): - return ERESTART_RESTARTBLOCK, true - default: - return 0, false - } -} - -func init() { - AddErrorTranslation(ErrWouldBlock, unix.EWOULDBLOCK) - AddErrorTranslation(ErrInterrupted, unix.EINTR) - AddErrorTranslation(ErrExceedsFileSizeLimit, unix.EFBIG) -} diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index ed4d7e958..dbe4506cc 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -46,7 +46,6 @@ deps_test( "//pkg/gohacks", "//pkg/goid", "//pkg/ilist", - "//pkg/iovec", "//pkg/linewriter", "//pkg/log", "//pkg/rand", @@ -70,7 +69,6 @@ deps_test( "//pkg/tcpip/header", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", - "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 48b24692b..c8460e63c 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -137,7 +137,13 @@ func TestCloseReader(t *testing.T) { addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.Addr.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } l, e := ListenTCP(s, addr, ipv4.ProtocolNumber) if e != nil { @@ -190,7 +196,13 @@ func TestCloseReaderWithForwarder(t *testing.T) { }() addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.Addr.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } done := make(chan struct{}) @@ -244,7 +256,13 @@ func TestCloseRead(t *testing.T) { }() addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.Addr.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) { var wq waiter.Queue @@ -288,7 +306,13 @@ func TestCloseWrite(t *testing.T) { }() addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.Addr.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) { var wq waiter.Queue @@ -349,10 +373,22 @@ func TestUDPForwarder(t *testing.T) { ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4()) addr1 := tcpip.FullAddress{NICID, ip1, 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, ip1) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ip1.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr1, err) + } ip2 := tcpip.Address(net.IPv4(169, 254, 10, 2).To4()) addr2 := tcpip.FullAddress{NICID, ip2, 11311} - s.AddAddress(NICID, ipv4.ProtocolNumber, ip2) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ip2.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr2, err) + } done := make(chan struct{}) fwd := udp.NewForwarder(s, func(r *udp.ForwarderRequest) { @@ -410,7 +446,13 @@ func TestDeadlineChange(t *testing.T) { addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.Addr.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } l, e := ListenTCP(s, addr, ipv4.ProtocolNumber) if e != nil { @@ -465,10 +507,22 @@ func TestPacketConnTransfer(t *testing.T) { ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4()) addr1 := tcpip.FullAddress{NICID, ip1, 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, ip1) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ip1.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr1, err) + } ip2 := tcpip.Address(net.IPv4(169, 254, 10, 2).To4()) addr2 := tcpip.FullAddress{NICID, ip2, 11311} - s.AddAddress(NICID, ipv4.ProtocolNumber, ip2) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ip2.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr2, err) + } c1, err := DialUDP(s, &addr1, nil, ipv4.ProtocolNumber) if err != nil { @@ -521,7 +575,13 @@ func TestConnectedPacketConnTransfer(t *testing.T) { ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4()) addr := tcpip.FullAddress{NICID, ip, 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, ip) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ip.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } c1, err := DialUDP(s, &addr, nil, ipv4.ProtocolNumber) if err != nil { @@ -565,24 +625,30 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) { ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4()) addr := tcpip.FullAddress{NICID, ip, 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, ip) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ip.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + return nil, nil, nil, fmt.Errorf("AddProtocolAddress(%d, %+v, {}): %w", NICID, protocolAddr, err) + } l, err := ListenTCP(s, addr, ipv4.ProtocolNumber) if err != nil { - return nil, nil, nil, fmt.Errorf("NewListener: %v", err) + return nil, nil, nil, fmt.Errorf("NewListener: %w", err) } c1, err = DialTCP(s, addr, ipv4.ProtocolNumber) if err != nil { l.Close() - return nil, nil, nil, fmt.Errorf("DialTCP: %v", err) + return nil, nil, nil, fmt.Errorf("DialTCP: %w", err) } c2, err = l.Accept() if err != nil { l.Close() c1.Close() - return nil, nil, nil, fmt.Errorf("l.Accept: %v", err) + return nil, nil, nil, fmt.Errorf("l.Accept: %w", err) } stop = func() { @@ -594,7 +660,7 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) { if err := l.Close(); err != nil { stop() - return nil, nil, nil, fmt.Errorf("l.Close(): %v", err) + return nil, nil, nil, fmt.Errorf("l.Close(): %w", err) } return c1, c2, stop, nil @@ -681,7 +747,13 @@ func TestDialContextTCPCanceled(t *testing.T) { }() addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.Addr.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } ctx := context.Background() ctx, cancel := context.WithCancel(ctx) @@ -703,7 +775,13 @@ func TestDialContextTCPTimeout(t *testing.T) { }() addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211} - s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.Addr.WithPrefix(), + } + if err := s.AddProtocolAddress(NICID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", NICID, protocolAddr, err) + } fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) { time.Sleep(time.Second) diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index e0dfe5813..2f34bf8dd 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -729,7 +729,7 @@ func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) Transp return } l := int(opts[i+1]) - if i < 2 || i+l > limit { + if l < 2 || i+l > limit { return } i += l diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go index 6aa9acfa8..e2c85e220 100644 --- a/pkg/tcpip/header/checksum.go +++ b/pkg/tcpip/header/checksum.go @@ -18,6 +18,7 @@ package header import ( "encoding/binary" + "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -234,3 +235,64 @@ func PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, srcAddr tcpip. return Checksum([]byte{0, uint8(protocol)}, xsum) } + +// checksumUpdate2ByteAlignedUint16 updates a uint16 value in a calculated +// checksum. +// +// The value MUST begin at a 2-byte boundary in the original buffer. +func checksumUpdate2ByteAlignedUint16(xsum, old, new uint16) uint16 { + // As per RFC 1071 page 4, + // (4) Incremental Update + // + // ... + // + // To update the checksum, simply add the differences of the + // sixteen bit integers that have been changed. To see why this + // works, observe that every 16-bit integer has an additive inverse + // and that addition is associative. From this it follows that + // given the original value m, the new value m', and the old + // checksum C, the new checksum C' is: + // + // C' = C + (-m) + m' = C + (m' - m) + return ChecksumCombine(xsum, ChecksumCombine(new, ^old)) +} + +// checksumUpdate2ByteAlignedAddress updates an address in a calculated +// checksum. +// +// The addresses must have the same length and must contain an even number +// of bytes. The address MUST begin at a 2-byte boundary in the original buffer. +func checksumUpdate2ByteAlignedAddress(xsum uint16, old, new tcpip.Address) uint16 { + const uint16Bytes = 2 + + if len(old) != len(new) { + panic(fmt.Sprintf("buffer lengths are different; old = %d, new = %d", len(old), len(new))) + } + + if len(old)%uint16Bytes != 0 { + panic(fmt.Sprintf("buffer has an odd number of bytes; got = %d", len(old))) + } + + // As per RFC 1071 page 4, + // (4) Incremental Update + // + // ... + // + // To update the checksum, simply add the differences of the + // sixteen bit integers that have been changed. To see why this + // works, observe that every 16-bit integer has an additive inverse + // and that addition is associative. From this it follows that + // given the original value m, the new value m', and the old + // checksum C, the new checksum C' is: + // + // C' = C + (-m) + m' = C + (m' - m) + for len(old) != 0 { + // Convert the 2 byte sequences to uint16 values then apply the increment + // update. + xsum = checksumUpdate2ByteAlignedUint16(xsum, (uint16(old[0])<<8)+uint16(old[1]), (uint16(new[0])<<8)+uint16(new[1])) + old = old[uint16Bytes:] + new = new[uint16Bytes:] + } + + return xsum +} diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go index d267dabd0..3445511f4 100644 --- a/pkg/tcpip/header/checksum_test.go +++ b/pkg/tcpip/header/checksum_test.go @@ -23,6 +23,7 @@ import ( "sync" "testing" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" ) @@ -256,3 +257,205 @@ func TestICMPv6Checksum(t *testing.T) { }) }, want, fmt.Sprintf("header: {% x} data {% x}", h, vv.ToView())) } + +func randomAddress(size int) tcpip.Address { + s := make([]byte, size) + for i := 0; i < size; i++ { + s[i] = byte(rand.Uint32()) + } + return tcpip.Address(s) +} + +func TestChecksummableNetworkUpdateAddress(t *testing.T) { + tests := []struct { + name string + update func(header.IPv4, tcpip.Address) + }{ + { + name: "SetSourceAddressWithChecksumUpdate", + update: header.IPv4.SetSourceAddressWithChecksumUpdate, + }, + { + name: "SetDestinationAddressWithChecksumUpdate", + update: header.IPv4.SetDestinationAddressWithChecksumUpdate, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for i := 0; i < 1000; i++ { + var origBytes [header.IPv4MinimumSize]byte + header.IPv4(origBytes[:]).Encode(&header.IPv4Fields{ + TOS: 1, + TotalLength: header.IPv4MinimumSize, + ID: 2, + Flags: 3, + FragmentOffset: 4, + TTL: 5, + Protocol: 6, + Checksum: 0, + SrcAddr: randomAddress(header.IPv4AddressSize), + DstAddr: randomAddress(header.IPv4AddressSize), + }) + + addr := randomAddress(header.IPv4AddressSize) + + bytesCopy := origBytes + h := header.IPv4(bytesCopy[:]) + origXSum := h.CalculateChecksum() + h.SetChecksum(^origXSum) + + test.update(h, addr) + got := ^h.Checksum() + h.SetChecksum(0) + want := h.CalculateChecksum() + if got != want { + t.Errorf("got h.Checksum() = 0x%x, want = 0x%x; originalBytes = 0x%x, new addr = %s", got, want, origBytes, addr) + } + } + }) + } +} + +func TestChecksummableTransportUpdatePort(t *testing.T) { + // The fields in the pseudo header is not tested here so we just use 0. + const pseudoHeaderXSum = 0 + + tests := []struct { + name string + transportHdr func(_, _ uint16) (header.ChecksummableTransport, func(uint16) uint16) + proto tcpip.TransportProtocolNumber + }{ + { + name: "TCP", + transportHdr: func(src, dst uint16) (header.ChecksummableTransport, func(uint16) uint16) { + h := header.TCP(make([]byte, header.TCPMinimumSize)) + h.Encode(&header.TCPFields{ + SrcPort: src, + DstPort: dst, + SeqNum: 1, + AckNum: 2, + DataOffset: header.TCPMinimumSize, + Flags: 3, + WindowSize: 4, + Checksum: 0, + UrgentPointer: 5, + }) + h.SetChecksum(^h.CalculateChecksum(pseudoHeaderXSum)) + return h, h.CalculateChecksum + }, + proto: header.TCPProtocolNumber, + }, + { + name: "UDP", + transportHdr: func(src, dst uint16) (header.ChecksummableTransport, func(uint16) uint16) { + h := header.UDP(make([]byte, header.UDPMinimumSize)) + h.Encode(&header.UDPFields{ + SrcPort: src, + DstPort: dst, + Length: 0, + Checksum: 0, + }) + h.SetChecksum(^h.CalculateChecksum(pseudoHeaderXSum)) + return h, h.CalculateChecksum + }, + proto: header.UDPProtocolNumber, + }, + } + + for i := 0; i < 1000; i++ { + origSrcPort := uint16(rand.Uint32()) + origDstPort := uint16(rand.Uint32()) + newPort := uint16(rand.Uint32()) + + t.Run(fmt.Sprintf("OrigSrcPort=%d,OrigDstPort=%d,NewPort=%d", origSrcPort, origDstPort, newPort), func(*testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for _, subTest := range []struct { + name string + update func(header.ChecksummableTransport) + }{ + { + name: "Source port", + update: func(h header.ChecksummableTransport) { h.SetSourcePortWithChecksumUpdate(newPort) }, + }, + { + name: "Destination port", + update: func(h header.ChecksummableTransport) { h.SetDestinationPortWithChecksumUpdate(newPort) }, + }, + } { + t.Run(subTest.name, func(t *testing.T) { + h, calcXSum := test.transportHdr(origSrcPort, origDstPort) + subTest.update(h) + // TCP and UDP hold the 1s complement of the fully calculated + // checksum. + got := ^h.Checksum() + h.SetChecksum(0) + + if want := calcXSum(pseudoHeaderXSum); got != want { + h, _ := test.transportHdr(origSrcPort, origDstPort) + t.Errorf("got Checksum() = 0x%x, want = 0x%x; originalBytes = %#v, new port = %d", got, want, h, newPort) + } + }) + } + }) + } + }) + } +} + +func TestChecksummableTransportUpdatePseudoHeaderAddress(t *testing.T) { + const addressSize = 6 + + tests := []struct { + name string + transportHdr func() header.ChecksummableTransport + proto tcpip.TransportProtocolNumber + }{ + { + name: "TCP", + transportHdr: func() header.ChecksummableTransport { return header.TCP(make([]byte, header.TCPMinimumSize)) }, + proto: header.TCPProtocolNumber, + }, + { + name: "UDP", + transportHdr: func() header.ChecksummableTransport { return header.UDP(make([]byte, header.UDPMinimumSize)) }, + proto: header.UDPProtocolNumber, + }, + } + + for i := 0; i < 1000; i++ { + permanent := randomAddress(addressSize) + old := randomAddress(addressSize) + new := randomAddress(addressSize) + + t.Run(fmt.Sprintf("Permanent=%q,Old=%q,New=%q", permanent, old, new), func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for _, fullChecksum := range []bool{true, false} { + t.Run(fmt.Sprintf("FullChecksum=%t", fullChecksum), func(t *testing.T) { + initialXSum := header.PseudoHeaderChecksum(test.proto, permanent, old, 0) + if fullChecksum { + // TCP and UDP hold the 1s complement of the fully calculated + // checksum. + initialXSum = ^initialXSum + } + + h := test.transportHdr() + h.SetChecksum(initialXSum) + h.UpdateChecksumPseudoHeaderAddress(old, new, fullChecksum) + + got := h.Checksum() + if fullChecksum { + got = ^got + } + if want := header.PseudoHeaderChecksum(test.proto, permanent, new, 0); got != want { + t.Errorf("got Checksum() = 0x%x, want = 0x%x; h = %#v", got, want, h) + } + }) + } + }) + } + }) + } +} diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index 95ade0e5c..1f18213e5 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -49,9 +49,9 @@ const ( // EthernetAddressSize is the size, in bytes, of an ethernet address. EthernetAddressSize = 6 - // unspecifiedEthernetAddress is the unspecified ethernet address + // UnspecifiedEthernetAddress is the unspecified ethernet address // (all bits set to 0). - unspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00") + UnspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00") // EthernetBroadcastAddress is an ethernet address that addresses every node // on a local link. @@ -134,7 +134,7 @@ func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool { return false } - if addr == unspecifiedEthernetAddress { + if addr == UnspecifiedEthernetAddress { return false } diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go index bf9ccbf1a..adc04e855 100644 --- a/pkg/tcpip/header/eth_test.go +++ b/pkg/tcpip/header/eth_test.go @@ -44,7 +44,7 @@ func TestIsValidUnicastEthernetAddress(t *testing.T) { }, { "Unspecified", - unspecifiedEthernetAddress, + UnspecifiedEthernetAddress, false, }, { @@ -91,7 +91,7 @@ func TestIsMulticastEthernetAddress(t *testing.T) { }, { "Unspecified", - unspecifiedEthernetAddress, + UnspecifiedEthernetAddress, false, }, { diff --git a/pkg/tcpip/header/interfaces.go b/pkg/tcpip/header/interfaces.go index 861cbbb70..3a41adfc4 100644 --- a/pkg/tcpip/header/interfaces.go +++ b/pkg/tcpip/header/interfaces.go @@ -53,6 +53,31 @@ type Transport interface { Payload() []byte } +// ChecksummableTransport is a Transport that supports checksumming. +type ChecksummableTransport interface { + Transport + + // SetSourcePortWithChecksumUpdate sets the source port and updates + // the checksum. + // + // The receiver's checksum must be a fully calculated checksum. + SetSourcePortWithChecksumUpdate(port uint16) + + // SetDestinationPortWithChecksumUpdate sets the destination port and updates + // the checksum. + // + // The receiver's checksum must be a fully calculated checksum. + SetDestinationPortWithChecksumUpdate(port uint16) + + // UpdateChecksumPseudoHeaderAddress updates the checksum to reflect an + // updated address in the pseudo header. + // + // If fullChecksum is true, the receiver's checksum field is assumed to hold a + // fully calculated checksum. Otherwise, it is assumed to hold a partially + // calculated checksum which only reflects the pseudo header. + UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) +} + // Network offers generic methods to query and/or update the fields of the // header of a network protocol buffer. type Network interface { @@ -90,3 +115,16 @@ type Network interface { // SetTOS sets the values of the "type of service" and "flow label" fields. SetTOS(t uint8, l uint32) } + +// ChecksummableNetwork is a Network that supports checksumming. +type ChecksummableNetwork interface { + Network + + // SetSourceAddressAndChecksum sets the source address and updates the + // checksum to reflect the new address. + SetSourceAddressWithChecksumUpdate(tcpip.Address) + + // SetDestinationAddressAndChecksum sets the destination address and + // updates the checksum to reflect the new address. + SetDestinationAddressWithChecksumUpdate(tcpip.Address) +} diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index e9abbb709..dcc549c7b 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -305,6 +305,18 @@ func (b IPv4) DestinationAddress() tcpip.Address { return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize]) } +// SetSourceAddressWithChecksumUpdate implements ChecksummableNetwork. +func (b IPv4) SetSourceAddressWithChecksumUpdate(new tcpip.Address) { + b.SetChecksum(^checksumUpdate2ByteAlignedAddress(^b.Checksum(), b.SourceAddress(), new)) + b.SetSourceAddress(new) +} + +// SetDestinationAddressWithChecksumUpdate implements ChecksummableNetwork. +func (b IPv4) SetDestinationAddressWithChecksumUpdate(new tcpip.Address) { + b.SetChecksum(^checksumUpdate2ByteAlignedAddress(^b.Checksum(), b.DestinationAddress(), new)) + b.SetDestinationAddress(new) +} + // padIPv4OptionsLength returns the total length for IPv4 options of length l // after applying padding according to RFC 791: // The internet header padding is used to ensure that the internet diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go index d6cad3a94..a647ea968 100644 --- a/pkg/tcpip/header/ndp_options.go +++ b/pkg/tcpip/header/ndp_options.go @@ -148,15 +148,10 @@ const ( // NDP option. That is, the length field for NDP options is in units of // 8 octets, as per RFC 4861 section 4.6. lengthByteUnits = 8 -) -var ( // NDPInfiniteLifetime is a value that represents infinity for the // 4-byte lifetime fields found in various NDP options. Its value is // (2^32 - 1)s = 4294967295s. - // - // This is a variable instead of a constant so that tests can change - // this value to a smaller value. It should only be modified by tests. NDPInfiniteLifetime = time.Second * math.MaxUint32 ) @@ -238,6 +233,17 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) { case ndpNonceOptionType: return NDPNonceOption(body), false, nil + case ndpRouteInformationType: + if numBodyBytes > ndpRouteInformationMaxLength { + return nil, true, fmt.Errorf("got %d bytes for NDP Route Information option's body, expected at max %d bytes: %w", numBodyBytes, ndpRouteInformationMaxLength, ErrNDPOptMalformedBody) + } + opt := NDPRouteInformation(body) + if err := opt.hasError(); err != nil { + return nil, true, err + } + + return opt, false, nil + case ndpPrefixInformationType: // Make sure the length of a Prefix Information option // body is ndpPrefixInformationLength, as per RFC 4861 @@ -935,3 +941,137 @@ func isUpperLetter(b byte) bool { func isDigit(b byte) bool { return b >= '0' && b <= '9' } + +// As per RFC 4191 section 2.3, +// +// 2.3. Route Information Option +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Type | Length | Prefix Length |Resvd|Prf|Resvd| +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Route Lifetime | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Prefix (Variable Length) | +// . . +// . . +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// Fields: +// +// Type 24 +// +// +// Length 8-bit unsigned integer. The length of the option +// (including the Type and Length fields) in units of 8 +// octets. The Length field is 1, 2, or 3 depending on the +// Prefix Length. If Prefix Length is greater than 64, then +// Length must be 3. If Prefix Length is greater than 0, +// then Length must be 2 or 3. If Prefix Length is zero, +// then Length must be 1, 2, or 3. +const ( + ndpRouteInformationType = ndpOptionIdentifier(24) + ndpRouteInformationMaxLength = 22 + + ndpRouteInformationPrefixLengthIdx = 0 + ndpRouteInformationFlagsIdx = 1 + ndpRouteInformationPrfShift = 3 + ndpRouteInformationPrfMask = 3 << ndpRouteInformationPrfShift + ndpRouteInformationRouteLifetimeIdx = 2 + ndpRouteInformationRoutePrefixIdx = 6 +) + +// NDPRouteInformation is the NDP Router Information option, as defined by +// RFC 4191 section 2.3. +type NDPRouteInformation []byte + +func (NDPRouteInformation) kind() ndpOptionIdentifier { + return ndpRouteInformationType +} + +func (o NDPRouteInformation) length() int { + return len(o) +} + +func (o NDPRouteInformation) serializeInto(b []byte) int { + return copy(b, o) +} + +// String implements fmt.Stringer. +func (o NDPRouteInformation) String() string { + return fmt.Sprintf("%T", o) +} + +// PrefixLength returns the length of the prefix. +func (o NDPRouteInformation) PrefixLength() uint8 { + return o[ndpRouteInformationPrefixLengthIdx] +} + +// RoutePreference returns the preference of the route over other routes to the +// same destination but through a different router. +func (o NDPRouteInformation) RoutePreference() NDPRoutePreference { + return NDPRoutePreference((o[ndpRouteInformationFlagsIdx] & ndpRouteInformationPrfMask) >> ndpRouteInformationPrfShift) +} + +// RouteLifetime returns the lifetime of the route. +// +// Note, a value of 0 implies the route is now invalid and a value of +// infinity/forever is represented by NDPInfiniteLifetime. +func (o NDPRouteInformation) RouteLifetime() time.Duration { + return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRouteInformationRouteLifetimeIdx:])) +} + +// Prefix returns the prefix of the destination subnet this route is for. +func (o NDPRouteInformation) Prefix() (tcpip.Subnet, error) { + prefixLength := int(o.PrefixLength()) + if max := IPv6AddressSize * 8; prefixLength > max { + return tcpip.Subnet{}, fmt.Errorf("got prefix length = %d, want <= %d", prefixLength, max) + } + + prefix := o[ndpRouteInformationRoutePrefixIdx:] + var addrBytes [IPv6AddressSize]byte + if n := copy(addrBytes[:], prefix); n != len(prefix) { + panic(fmt.Sprintf("got copy(addrBytes, prefix) = %d, want = %d", n, len(prefix))) + } + + return tcpip.AddressWithPrefix{ + Address: tcpip.Address(addrBytes[:]), + PrefixLen: prefixLength, + }.Subnet(), nil +} + +func (o NDPRouteInformation) hasError() error { + l := len(o) + if l < ndpRouteInformationRoutePrefixIdx { + return fmt.Errorf("%T too small, got = %d bytes: %w", o, l, ErrNDPOptMalformedBody) + } + + prefixLength := int(o.PrefixLength()) + if max := IPv6AddressSize * 8; prefixLength > max { + return fmt.Errorf("got prefix length = %d, want <= %d: %w", prefixLength, max, ErrNDPOptMalformedBody) + } + + // Length 8-bit unsigned integer. The length of the option + // (including the Type and Length fields) in units of 8 + // octets. The Length field is 1, 2, or 3 depending on the + // Prefix Length. If Prefix Length is greater than 64, then + // Length must be 3. If Prefix Length is greater than 0, + // then Length must be 2 or 3. If Prefix Length is zero, + // then Length must be 1, 2, or 3. + l += 2 // Add 2 bytes for the type and length bytes. + lengthField := l / lengthByteUnits + if prefixLength > 64 { + if lengthField != 3 { + return fmt.Errorf("Length field must be 3 when Prefix Length (%d) is > 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) + } + } else if prefixLength > 0 { + if lengthField != 2 && lengthField != 3 { + return fmt.Errorf("Length field must be 2 or 3 when Prefix Length (%d) is between 0 and 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) + } + } else if lengthField == 0 || lengthField > 3 { + return fmt.Errorf("Length field must be 1, 2, or 3 when Prefix Length is zero (got = %d): %w", lengthField, ErrNDPOptMalformedBody) + } + + return nil +} diff --git a/pkg/tcpip/header/ndp_router_advert.go b/pkg/tcpip/header/ndp_router_advert.go index bf7610863..7d6efa083 100644 --- a/pkg/tcpip/header/ndp_router_advert.go +++ b/pkg/tcpip/header/ndp_router_advert.go @@ -16,15 +16,94 @@ package header import ( "encoding/binary" + "fmt" "time" ) +var _ fmt.Stringer = NDPRoutePreference(0) + +// NDPRoutePreference is the preference values for default routers or +// more-specific routes. +// +// As per RFC 4191 section 2.1, +// +// Default router preferences and preferences for more-specific routes +// are encoded the same way. +// +// Preference values are encoded as a two-bit signed integer, as +// follows: +// +// 01 High +// 00 Medium (default) +// 11 Low +// 10 Reserved - MUST NOT be sent +// +// Note that implementations can treat the value as a two-bit signed +// integer. +// +// Having just three values reinforces that they are not metrics and +// more values do not appear to be necessary for reasonable scenarios. +type NDPRoutePreference uint8 + +const ( + // HighRoutePreference indicates a high preference, as per + // RFC 4191 section 2.1. + HighRoutePreference NDPRoutePreference = 0b01 + + // MediumRoutePreference indicates a medium preference, as per + // RFC 4191 section 2.1. + // + // This is the default preference value. + MediumRoutePreference = 0b00 + + // LowRoutePreference indicates a low preference, as per + // RFC 4191 section 2.1. + LowRoutePreference = 0b11 + + // ReservedRoutePreference is a reserved preference value, as per + // RFC 4191 section 2.1. + // + // It MUST NOT be sent. + ReservedRoutePreference = 0b10 +) + +// String implements fmt.Stringer. +func (p NDPRoutePreference) String() string { + switch p { + case HighRoutePreference: + return "HighRoutePreference" + case MediumRoutePreference: + return "MediumRoutePreference" + case LowRoutePreference: + return "LowRoutePreference" + case ReservedRoutePreference: + return "ReservedRoutePreference" + default: + return fmt.Sprintf("NDPRoutePreference(%d)", p) + } +} + // NDPRouterAdvert is an NDP Router Advertisement message. It will only contain // the body of an ICMPv6 packet. // -// See RFC 4861 section 4.2 for more details. +// See RFC 4861 section 4.2 and RFC 4191 section 2.2 for more details. type NDPRouterAdvert []byte +// As per RFC 4191 section 2.2, +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Type | Code | Checksum | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Cur Hop Limit |M|O|H|Prf|Resvd| Router Lifetime | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Reachable Time | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Retrans Timer | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Options ... +// +-+-+-+-+-+-+-+-+-+-+-+- const ( // NDPRAMinimumSize is the minimum size of a valid NDP Router // Advertisement message (body of an ICMPv6 packet). @@ -47,6 +126,14 @@ const ( // within the bit-field/flags byte of an NDPRouterAdvert. ndpRAOtherConfFlagMask = (1 << 6) + // ndpDefaultRouterPreferenceShift is the shift of the Prf (Default Router + // Preference) field within the flags byte of an NDPRouterAdvert. + ndpDefaultRouterPreferenceShift = 3 + + // ndpDefaultRouterPreferenceMask is the mask of the Prf (Default Router + // Preference) field within the flags byte of an NDPRouterAdvert. + ndpDefaultRouterPreferenceMask = (0b11 << ndpDefaultRouterPreferenceShift) + // ndpRARouterLifetimeOffset is the start of the 2-byte Router Lifetime // field within an NDPRouterAdvert. ndpRARouterLifetimeOffset = 2 @@ -80,6 +167,11 @@ func (b NDPRouterAdvert) OtherConfFlag() bool { return b[ndpRAFlagsOffset]&ndpRAOtherConfFlagMask != 0 } +// DefaultRouterPreference returns the Default Router Preference field. +func (b NDPRouterAdvert) DefaultRouterPreference() NDPRoutePreference { + return NDPRoutePreference((b[ndpRAFlagsOffset] & ndpDefaultRouterPreferenceMask) >> ndpDefaultRouterPreferenceShift) +} + // RouterLifetime returns the lifetime associated with the default router. A // value of 0 means the source of the Router Advertisement is not a default // router and SHOULD NOT appear on the default router list. Note, a value of 0 diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go index 1b5093e58..2a897e938 100644 --- a/pkg/tcpip/header/ndp_test.go +++ b/pkg/tcpip/header/ndp_test.go @@ -21,6 +21,7 @@ import ( "fmt" "io" "regexp" + "strings" "testing" "time" @@ -58,6 +59,224 @@ func TestNDPNeighborSolicit(t *testing.T) { } } +func TestNDPRouteInformationOption(t *testing.T) { + tests := []struct { + name string + + length uint8 + prefixLength uint8 + prf NDPRoutePreference + lifetimeS uint32 + prefixBytes []byte + expectedPrefix tcpip.Subnet + + expectedErr error + }{ + { + name: "Length=1 with Prefix Length = 0", + length: 1, + prefixLength: 0, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: IPv6EmptySubnet, + }, + { + name: "Length=1 but Prefix Length > 0", + length: 1, + prefixLength: 1, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedErr: ErrNDPOptMalformedBody, + }, + { + name: "Length=2 with Prefix Length = 0", + length: 2, + prefixLength: 0, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: IPv6EmptySubnet, + }, + { + name: "Length=2 with Prefix Length in [1, 64] (1)", + length: 2, + prefixLength: 1, + prf: LowRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 1, + }.Subnet(), + }, + { + name: "Length=2 with Prefix Length in [1, 64] (64)", + length: 2, + prefixLength: 64, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 64, + }.Subnet(), + }, + { + name: "Length=2 with Prefix Length > 64", + length: 2, + prefixLength: 65, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedErr: ErrNDPOptMalformedBody, + }, + { + name: "Length=3 with Prefix Length = 0", + length: 3, + prefixLength: 0, + prf: MediumRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: IPv6EmptySubnet, + }, + { + name: "Length=3 with Prefix Length in [1, 64] (1)", + length: 3, + prefixLength: 1, + prf: LowRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 1, + }.Subnet(), + }, + { + name: "Length=3 with Prefix Length in [1, 64] (64)", + length: 3, + prefixLength: 64, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 64, + }.Subnet(), + }, + { + name: "Length=3 with Prefix Length in [65, 128] (65)", + length: 3, + prefixLength: 65, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 65, + }.Subnet(), + }, + { + name: "Length=3 with Prefix Length in [65, 128] (128)", + length: 3, + prefixLength: 128, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address(strings.Repeat("\x00", IPv6AddressSize)), + PrefixLen: 128, + }.Subnet(), + }, + { + name: "Length=3 with (invalid) Prefix Length > 128", + length: 3, + prefixLength: 129, + prf: HighRoutePreference, + lifetimeS: 1, + prefixBytes: nil, + expectedErr: ErrNDPOptMalformedBody, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + expectedRouteInformationBytes := [...]byte{ + // Type, Length + 24, test.length, + + // Prefix Length, Prf + uint8(test.prefixLength), uint8(test.prf) << 3, + + // Route Lifetime + 0, 0, 0, 0, + + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + } + binary.BigEndian.PutUint32(expectedRouteInformationBytes[4:], test.lifetimeS) + _ = copy(expectedRouteInformationBytes[8:], test.prefixBytes) + + opts := NDPOptions(expectedRouteInformationBytes[:test.length*lengthByteUnits]) + it, err := opts.Iter(false) + if err != nil { + t.Fatalf("got Iter(false) = (_, %s), want = (_, nil)", err) + } + opt, done, err := it.Next() + if !errors.Is(err, test.expectedErr) { + t.Fatalf("got Next() = (_, _, %s), want = (_, _, %s)", err, test.expectedErr) + } + if want := test.expectedErr != nil; done != want { + t.Fatalf("got Next() = (_, %t, _), want = (_, %t, _)", done, want) + } + if test.expectedErr != nil { + return + } + + if got := opt.kind(); got != ndpRouteInformationType { + t.Errorf("got kind() = %d, want = %d", got, ndpRouteInformationType) + } + + ri, ok := opt.(NDPRouteInformation) + if !ok { + t.Fatalf("got opt = %T, want = NDPRouteInformation", opt) + } + + if got := ri.PrefixLength(); got != test.prefixLength { + t.Errorf("got PrefixLength() = %d, want = %d", got, test.prefixLength) + } + if got := ri.RoutePreference(); got != test.prf { + t.Errorf("got RoutePreference() = %d, want = %d", got, test.prf) + } + if got, want := ri.RouteLifetime(), time.Duration(test.lifetimeS)*time.Second; got != want { + t.Errorf("got RouteLifetime() = %s, want = %s", got, want) + } + if got, err := ri.Prefix(); err != nil { + t.Errorf("Prefix(): %s", err) + } else if got != test.expectedPrefix { + t.Errorf("got Prefix() = %s, want = %s", got, test.expectedPrefix) + } + + // Iterator should not return anything else. + { + next, done, err := it.Next() + if err != nil { + t.Errorf("got Next() = (_, _, %s), want = (_, _, nil)", err) + } + if !done { + t.Error("got Next() = (_, false, _), want = (_, true, _)") + } + if next != nil { + t.Errorf("got Next() = (%x, _, _), want = (nil, _, _)", next) + } + } + }) + } +} + // TestNDPNeighborAdvert tests the functions of NDPNeighborAdvert. func TestNDPNeighborAdvert(t *testing.T) { b := []byte{ @@ -126,36 +345,83 @@ func TestNDPNeighborAdvert(t *testing.T) { } func TestNDPRouterAdvert(t *testing.T) { - b := []byte{ - 64, 128, 1, 2, - 3, 4, 5, 6, - 7, 8, 9, 10, + tests := []struct { + hopLimit uint8 + managedFlag, otherConfFlag bool + prf NDPRoutePreference + routerLifetimeS uint16 + reachableTimeMS, retransTimerMS uint32 + }{ + { + hopLimit: 1, + managedFlag: false, + otherConfFlag: true, + prf: HighRoutePreference, + routerLifetimeS: 2, + reachableTimeMS: 3, + retransTimerMS: 4, + }, + { + hopLimit: 64, + managedFlag: true, + otherConfFlag: false, + prf: LowRoutePreference, + routerLifetimeS: 258, + reachableTimeMS: 78492, + retransTimerMS: 13213, + }, } - ra := NDPRouterAdvert(b) + for i, test := range tests { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + flags := uint8(0) + if test.managedFlag { + flags |= 1 << 7 + } + if test.otherConfFlag { + flags |= 1 << 6 + } + flags |= uint8(test.prf) << 3 + + b := []byte{ + test.hopLimit, flags, 1, 2, + 3, 4, 5, 6, + 7, 8, 9, 10, + } + binary.BigEndian.PutUint16(b[2:], test.routerLifetimeS) + binary.BigEndian.PutUint32(b[4:], test.reachableTimeMS) + binary.BigEndian.PutUint32(b[8:], test.retransTimerMS) - if got := ra.CurrHopLimit(); got != 64 { - t.Errorf("got ra.CurrHopLimit = %d, want = 64", got) - } + ra := NDPRouterAdvert(b) - if got := ra.ManagedAddrConfFlag(); !got { - t.Errorf("got ManagedAddrConfFlag = false, want = true") - } + if got := ra.CurrHopLimit(); got != test.hopLimit { + t.Errorf("got ra.CurrHopLimit() = %d, want = %d", got, test.hopLimit) + } - if got := ra.OtherConfFlag(); got { - t.Errorf("got OtherConfFlag = true, want = false") - } + if got := ra.ManagedAddrConfFlag(); got != test.managedFlag { + t.Errorf("got ManagedAddrConfFlag() = %t, want = %t", got, test.managedFlag) + } - if got, want := ra.RouterLifetime(), time.Second*258; got != want { - t.Errorf("got ra.RouterLifetime = %d, want = %d", got, want) - } + if got := ra.OtherConfFlag(); got != test.otherConfFlag { + t.Errorf("got OtherConfFlag() = %t, want = %t", got, test.otherConfFlag) + } - if got, want := ra.ReachableTime(), time.Millisecond*50595078; got != want { - t.Errorf("got ra.ReachableTime = %d, want = %d", got, want) - } + if got := ra.DefaultRouterPreference(); got != test.prf { + t.Errorf("got DefaultRouterPreference() = %d, want = %d", got, test.prf) + } + + if got, want := ra.RouterLifetime(), time.Second*time.Duration(test.routerLifetimeS); got != want { + t.Errorf("got ra.RouterLifetime() = %d, want = %d", got, want) + } + + if got, want := ra.ReachableTime(), time.Millisecond*time.Duration(test.reachableTimeMS); got != want { + t.Errorf("got ra.ReachableTime() = %d, want = %d", got, want) + } - if got, want := ra.RetransTimer(), time.Millisecond*117967114; got != want { - t.Errorf("got ra.RetransTimer = %d, want = %d", got, want) + if got, want := ra.RetransTimer(), time.Millisecond*time.Duration(test.retransTimerMS); got != want { + t.Errorf("got ra.RetransTimer() = %d, want = %d", got, want) + } + }) } } @@ -1451,3 +1717,32 @@ func TestNDPOptionsIter(t *testing.T) { t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next) } } + +func TestNDPRoutePreferenceStringer(t *testing.T) { + p := NDPRoutePreference(0) + for { + var wantStr string + switch p { + case 0b01: + wantStr = "HighRoutePreference" + case 0b00: + wantStr = "MediumRoutePreference" + case 0b11: + wantStr = "LowRoutePreference" + case 0b10: + wantStr = "ReservedRoutePreference" + default: + wantStr = fmt.Sprintf("NDPRoutePreference(%d)", p) + } + + if gotStr := p.String(); gotStr != wantStr { + t.Errorf("got NDPRoutePreference(%d).String() = %s, want = %s", p, gotStr, wantStr) + } + + p++ + if p == 0 { + // Overflowed, we hit all values. + break + } + } +} diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 8dabe3354..a75e51a28 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -390,6 +390,35 @@ func (b TCP) EncodePartial(partialChecksum, length uint16, seqnum, acknum uint32 b.SetChecksum(^checksum) } +// SetSourcePortWithChecksumUpdate implements ChecksummableTransport. +func (b TCP) SetSourcePortWithChecksumUpdate(new uint16) { + old := b.SourcePort() + b.SetSourcePort(new) + b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) +} + +// SetDestinationPortWithChecksumUpdate implements ChecksummableTransport. +func (b TCP) SetDestinationPortWithChecksumUpdate(new uint16) { + old := b.DestinationPort() + b.SetDestinationPort(new) + b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) +} + +// UpdateChecksumPseudoHeaderAddress implements ChecksummableTransport. +func (b TCP) UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) { + xsum := b.Checksum() + if fullChecksum { + xsum = ^xsum + } + + xsum = checksumUpdate2ByteAlignedAddress(xsum, old, new) + if fullChecksum { + xsum = ^xsum + } + + b.SetChecksum(xsum) +} + // ParseSynOptions parses the options received in a SYN segment and returns the // relevant ones. opts should point to the option part of the TCP header. func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions { diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go index ae9d167ff..f69d53314 100644 --- a/pkg/tcpip/header/udp.go +++ b/pkg/tcpip/header/udp.go @@ -130,3 +130,32 @@ func (b UDP) Encode(u *UDPFields) { binary.BigEndian.PutUint16(b[udpLength:], u.Length) binary.BigEndian.PutUint16(b[udpChecksum:], u.Checksum) } + +// SetSourcePortWithChecksumUpdate implements ChecksummableTransport. +func (b UDP) SetSourcePortWithChecksumUpdate(new uint16) { + old := b.SourcePort() + b.SetSourcePort(new) + b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) +} + +// SetDestinationPortWithChecksumUpdate implements ChecksummableTransport. +func (b UDP) SetDestinationPortWithChecksumUpdate(new uint16) { + old := b.DestinationPort() + b.SetDestinationPort(new) + b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) +} + +// UpdateChecksumPseudoHeaderAddress implements ChecksummableTransport. +func (b UDP) UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) { + xsum := b.Checksum() + if fullChecksum { + xsum = ^xsum + } + + xsum = checksumUpdate2ByteAlignedAddress(xsum, old, new) + if fullChecksum { + xsum = ^xsum + } + + b.SetChecksum(xsum) +} diff --git a/pkg/tcpip/internal/tcp/BUILD b/pkg/tcpip/internal/tcp/BUILD new file mode 100644 index 000000000..9ae258a0b --- /dev/null +++ b/pkg/tcpip/internal/tcp/BUILD @@ -0,0 +1,12 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "tcp", + srcs = ["tcp.go"], + visibility = ["//pkg/tcpip:__subpackages__"], + deps = [ + "//pkg/tcpip", + ], +) diff --git a/pkg/tcpip/internal/tcp/tcp.go b/pkg/tcpip/internal/tcp/tcp.go new file mode 100644 index 000000000..0616d368c --- /dev/null +++ b/pkg/tcpip/internal/tcp/tcp.go @@ -0,0 +1,48 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tcp contains internal type definitions that are not expected to be +// used by anyone else outside pkg/tcpip. +package tcp + +import ( + "time" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +// TSOffset is an offset applied to the value of the TSVal field in the TCP +// Timestamp option. +// +// +stateify savable +type TSOffset struct { + milliseconds uint32 +} + +// NewTSOffset creates a new TSOffset from milliseconds. +func NewTSOffset(milliseconds uint32) TSOffset { + return TSOffset{ + milliseconds: milliseconds, + } +} + +// TSVal applies the offset to now and returns the timestamp in milliseconds. +func (offset TSOffset) TSVal(now tcpip.MonotonicTime) uint32 { + return uint32(now.Sub(tcpip.MonotonicTime{}).Milliseconds()) + offset.milliseconds +} + +// Elapsed calculates the elapsed time given now and the echoed back timestamp. +func (offset TSOffset) Elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { + return time.Duration(offset.TSVal(now)-tsEcr) * time.Millisecond +} diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index f26c857eb..658557d62 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -28,7 +28,9 @@ import ( // PacketInfo holds all the information about an outbound packet. type PacketInfo struct { - Pkt *stack.PacketBuffer + Pkt *stack.PacketBuffer + + // TODO(https://gvisor.dev/issue/6537): Remove these fields. Proto tcpip.NetworkProtocolNumber Route stack.RouteInfo } @@ -244,7 +246,10 @@ func (e *Endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocol Route: r, } - e.q.Write(p) + // Write returns false if the queue is full. A full queue is not an error + // from the perspective of a LinkEndpoint so we ignore Write's return + // value and always return nil from this method. + _ = e.q.Write(p) return nil } @@ -290,3 +295,18 @@ func (*Endpoint) ARPHardwareType() header.ARPHardwareType { // AddHeader implements stack.LinkEndpoint.AddHeader. func (*Endpoint) AddHeader(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) { } + +// WriteRawPacket implements stack.LinkEndpoint. +func (e *Endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error { + p := PacketInfo{ + Pkt: pkt, + Proto: pkt.NetworkProtocolNumber, + } + + // Write returns false if the queue is full. A full queue is not an error + // from the perspective of a LinkEndpoint so we ignore Write's return + // value and always return nil from this method. + _ = e.q.Write(p) + + return nil +} diff --git a/pkg/tcpip/link/ethernet/ethernet.go b/pkg/tcpip/link/ethernet/ethernet.go index b427c6170..8211a2031 100644 --- a/pkg/tcpip/link/ethernet/ethernet.go +++ b/pkg/tcpip/link/ethernet/ethernet.go @@ -42,6 +42,14 @@ type Endpoint struct { nested.Endpoint } +// LinkAddress implements stack.LinkEndpoint. +func (e *Endpoint) LinkAddress() tcpip.LinkAddress { + if l := e.Endpoint.LinkAddress(); len(l) != 0 { + return l + } + return header.UnspecifiedEthernetAddress +} + // DeliverNetworkPacket implements stack.NetworkDispatcher. func (e *Endpoint) DeliverNetworkPacket(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) @@ -57,18 +65,22 @@ func (e *Endpoint) DeliverNetworkPacket(_, _ tcpip.LinkAddress, _ tcpip.NetworkP // Capabilities implements stack.LinkEndpoint. func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { - return stack.CapabilityResolutionRequired | e.Endpoint.Capabilities() + c := e.Endpoint.Capabilities() + if c&stack.CapabilityLoopback == 0 { + c |= stack.CapabilityResolutionRequired + } + return c } // WritePacket implements stack.LinkEndpoint. func (e *Endpoint) WritePacket(r stack.RouteInfo, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { - e.AddHeader(e.Endpoint.LinkAddress(), r.RemoteLinkAddress, proto, pkt) + e.AddHeader(e.LinkAddress(), r.RemoteLinkAddress, proto, pkt) return e.Endpoint.WritePacket(r, proto, pkt) } // WritePackets implements stack.LinkEndpoint. func (e *Endpoint) WritePackets(r stack.RouteInfo, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, tcpip.Error) { - linkAddr := e.Endpoint.LinkAddress() + linkAddr := e.LinkAddress() for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { e.AddHeader(linkAddr, r.RemoteLinkAddress, proto, pkt) @@ -83,7 +95,10 @@ func (e *Endpoint) MaxHeaderLength() uint16 { } // ARPHardwareType implements stack.LinkEndpoint. -func (*Endpoint) ARPHardwareType() header.ARPHardwareType { +func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { + if a := e.Endpoint.ARPHardwareType(); a != header.ARPHardwareNone { + return a + } return header.ARPHardwareEther } @@ -97,3 +112,8 @@ func (*Endpoint) AddHeader(local, remote tcpip.LinkAddress, proto tcpip.NetworkP } eth.Encode(&fields) } + +// WriteRawPacket implements stack.LinkEndpoint. +func (e *Endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error { + return e.Endpoint.WriteRawPacket(pkt) +} diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index d971194e6..1d0163823 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -14,7 +14,6 @@ go_library( ], visibility = ["//visibility:public"], deps = [ - "//pkg/iovec", "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 735c28da1..058242f96 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package fdbased provides the implemention of data-link layer endpoints @@ -44,7 +45,6 @@ import ( "sync/atomic" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/iovec" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -56,6 +56,7 @@ import ( // linkDispatcher reads packets from the link FD and dispatches them to the // NetworkDispatcher. type linkDispatcher interface { + stop() dispatch() (bool, tcpip.Error) } @@ -138,6 +139,20 @@ type endpoint struct { // gsoKind is the supported kind of GSO. gsoKind stack.SupportedGSO + + // maxSyscallHeaderBytes has the same meaning as + // Options.MaxSyscallHeaderBytes. + maxSyscallHeaderBytes uintptr + + // writevMaxIovs is the maximum number of iovecs that may be passed to + // rawfile.NonBlockingWriteIovec, as possibly limited by + // maxSyscallHeaderBytes. (No analogous limit is defined for + // rawfile.NonBlockingSendMMsg, since in that case the maximum number of + // iovecs also depends on the number of mmsghdrs. Instead, if sendBatch + // encounters a packet whose iovec count is limited by + // maxSyscallHeaderBytes, it falls back to writing the packet using writev + // via WritePacket.) + writevMaxIovs int } // Options specify the details about the fd-based endpoint to be created. @@ -186,6 +201,11 @@ type Options struct { // RXChecksumOffload if true, indicates that this endpoints capability // set should include CapabilityRXChecksumOffload. RXChecksumOffload bool + + // If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes + // of struct iovec, msghdr, and mmsghdr that may be passed by each host + // system call. + MaxSyscallHeaderBytes int } // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT @@ -235,14 +255,25 @@ func New(opts *Options) (stack.LinkEndpoint, error) { return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") } + if opts.MaxSyscallHeaderBytes < 0 { + return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") + } + e := &endpoint{ - fds: opts.FDs, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: hdrSize, - packetDispatchMode: opts.PacketDispatchMode, + fds: opts.FDs, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + packetDispatchMode: opts.PacketDispatchMode, + maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), + writevMaxIovs: rawfile.MaxIovs, + } + if e.maxSyscallHeaderBytes != 0 { + if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { + e.writevMaxIovs = max + } } // Increment fanoutID to ensure that we don't re-use the same fanoutID for @@ -351,16 +382,27 @@ func isSocketFD(fd int) (bool, error) { // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { - e.dispatcher = dispatcher - // Link endpoints are not savable. When transportation endpoints are - // saved, they stop sending outgoing packets and all incoming packets - // are rejected. - for i := range e.inboundDispatchers { - e.wg.Add(1) - go func(i int) { // S/R-SAFE: See above. - e.dispatchLoop(e.inboundDispatchers[i]) - e.wg.Done() - }(i) + // nil means the NIC is being removed. + if dispatcher == nil && e.dispatcher != nil { + for _, dispatcher := range e.inboundDispatchers { + dispatcher.stop() + } + e.Wait() + e.dispatcher = nil + return + } + if dispatcher != nil && e.dispatcher == nil { + e.dispatcher = dispatcher + // Link endpoints are not savable. When transportation endpoints are + // saved, they stop sending outgoing packets and all incoming packets + // are rejected. + for i := range e.inboundDispatchers { + e.wg.Add(1) + go func(i int) { // S/R-SAFE: See above. + e.dispatchLoop(e.inboundDispatchers[i]) + e.wg.Done() + }(i) + } } } @@ -463,6 +505,9 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net } } +// WriteRawPacket implements stack.LinkEndpoint. +func (*endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } + // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { @@ -470,9 +515,8 @@ func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocol e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) } - var builder iovec.Builder - fd := e.fds[pkt.Hash%uint32(len(e.fds))] + var vnetHdrBuf []byte if e.gsoKind == stack.HWGSOSupported { vnetHdr := virtioNetHdr{} if pkt.GSOOptions.Type != stack.GSONone { @@ -494,71 +538,123 @@ func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocol vnetHdr.gsoSize = pkt.GSOOptions.MSS } } + vnetHdrBuf = vnetHdr.marshal() + } - vnetHdrBuf := vnetHdr.marshal() - builder.Add(vnetHdrBuf) + views := pkt.Views() + numIovecs := len(views) + if len(vnetHdrBuf) != 0 { + numIovecs++ + } + if numIovecs > e.writevMaxIovs { + numIovecs = e.writevMaxIovs } - for _, v := range pkt.Views() { - builder.Add(v) + // Allocate small iovec arrays on the stack. + var iovecsArr [8]unix.Iovec + iovecs := iovecsArr[:0] + if numIovecs > len(iovecsArr) { + iovecs = make([]unix.Iovec, 0, numIovecs) } - return rawfile.NonBlockingWriteIovec(fd, builder.Build()) + iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) + for _, v := range views { + iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) + } + return rawfile.NonBlockingWriteIovec(fd, iovecs) } -func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, tcpip.Error) { +func (e *endpoint) sendBatch(batchFD int, pkts []*stack.PacketBuffer) (int, tcpip.Error) { // Send a batch of packets through batchFD. - mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch)) - for _, pkt := range batch { - if e.hdrSize > 0 { - e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt) - } + mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) + packets := 0 + for packets < len(pkts) { + mmsgHdrs := mmsgHdrsStorage + batch := pkts[packets:] + syscallHeaderBytes := uintptr(0) + for _, pkt := range batch { + if e.hdrSize > 0 { + e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt) + } - var vnetHdrBuf []byte - if e.gsoKind == stack.HWGSOSupported { - vnetHdr := virtioNetHdr{} - if pkt.GSOOptions.Type != stack.GSONone { - vnetHdr.hdrLen = uint16(pkt.HeaderSize()) - if pkt.GSOOptions.NeedsCsum { - vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM - vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen - vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset - } - if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { - switch pkt.GSOOptions.Type { - case stack.GSOTCPv4: - vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 - case stack.GSOTCPv6: - vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 - default: - panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) + var vnetHdrBuf []byte + if e.gsoKind == stack.HWGSOSupported { + vnetHdr := virtioNetHdr{} + if pkt.GSOOptions.Type != stack.GSONone { + vnetHdr.hdrLen = uint16(pkt.HeaderSize()) + if pkt.GSOOptions.NeedsCsum { + vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM + vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen + vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset + } + if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { + switch pkt.GSOOptions.Type { + case stack.GSOTCPv4: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 + case stack.GSOTCPv6: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 + default: + panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) + } + vnetHdr.gsoSize = pkt.GSOOptions.MSS } - vnetHdr.gsoSize = pkt.GSOOptions.MSS } + vnetHdrBuf = vnetHdr.marshal() } - vnetHdrBuf = vnetHdr.marshal() - } - var builder iovec.Builder - builder.Add(vnetHdrBuf) - for _, v := range pkt.Views() { - builder.Add(v) - } - iovecs := builder.Build() + views := pkt.Views() + numIovecs := len(views) + if len(vnetHdrBuf) != 0 { + numIovecs++ + } + if numIovecs > rawfile.MaxIovs { + numIovecs = rawfile.MaxIovs + } + if e.maxSyscallHeaderBytes != 0 { + syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec + if syscallHeaderBytes > e.maxSyscallHeaderBytes { + // We can't fit this packet into this call to sendmmsg(). + // We could potentially do so if we reduced numIovecs + // further, but this might incur considerable extra + // copying. Leave it to the next batch instead. + break + } + } - var mmsgHdr rawfile.MMsgHdr - mmsgHdr.Msg.Iov = &iovecs[0] - mmsgHdr.Msg.SetIovlen((len(iovecs))) - mmsgHdrs = append(mmsgHdrs, mmsgHdr) - } + // We can't easily allocate iovec arrays on the stack here since + // they will escape this loop iteration via mmsgHdrs. + iovecs := make([]unix.Iovec, 0, numIovecs) + iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) + for _, v := range views { + iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) + } - packets := 0 - for len(mmsgHdrs) > 0 { - sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) - if err != nil { - return packets, err + var mmsgHdr rawfile.MMsgHdr + mmsgHdr.Msg.Iov = &iovecs[0] + mmsgHdr.Msg.SetIovlen(len(iovecs)) + mmsgHdrs = append(mmsgHdrs, mmsgHdr) + } + + if len(mmsgHdrs) == 0 { + // We can't fit batch[0] into a mmsghdr while staying under + // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the + // mmsghdr (by using writev) and re-buffer iovecs more aggressively + // if necessary (by using e.writevMaxIovs instead of + // rawfile.MaxIovs). + pkt := batch[0] + if err := e.WritePacket(pkt.EgressRoute, pkt.NetworkProtocolNumber, pkt); err != nil { + return packets, err + } + packets++ + } else { + for len(mmsgHdrs) > 0 { + sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) + if err != nil { + return packets, err + } + packets += sent + mmsgHdrs = mmsgHdrs[sent:] + } } - packets += sent - mmsgHdrs = mmsgHdrs[sent:] } return packets, nil @@ -676,8 +772,9 @@ func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabiliti unix.SetNonblock(fd, true) return &InjectableEndpoint{endpoint: endpoint{ - fds: []int{fd}, - mtu: mtu, - caps: capabilities, + fds: []int{fd}, + mtu: mtu, + caps: capabilities, + writevMaxIovs: rawfile.MaxIovs, }} } diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 8aad338b6..eccd21579 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package fdbased diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go index df14eaad1..904393faa 100644 --- a/pkg/tcpip/link/fdbased/endpoint_unsafe.go +++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package fdbased diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go index 5d698a5e9..3f516cab5 100644 --- a/pkg/tcpip/link/fdbased/mmap.go +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build (linux && amd64) || (linux && arm64) // +build linux,amd64 linux,arm64 package fdbased @@ -113,6 +114,7 @@ func (t tPacketHdr) Payload() []byte { // packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets. // See: mmap_amd64_unsafe.go for implementation details. type packetMMapDispatcher struct { + stopFd // fd is the file descriptor used to send and receive packets. fd int @@ -128,18 +130,18 @@ type packetMMapDispatcher struct { ringOffset int } -func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, tcpip.Error) { +func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, bool, tcpip.Error) { hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:]) for hdr.tpStatus()&tpStatusUser == 0 { - event := rawfile.PollEvent{ - FD: int32(d.fd), - Events: unix.POLLIN | unix.POLLERR, - } - if _, errno := rawfile.BlockingPoll(&event, 1, nil); errno != 0 { + stopped, errno := rawfile.BlockingPollUntilStopped(d.efd, d.fd, unix.POLLIN|unix.POLLERR) + if errno != 0 { if errno == unix.EINTR { continue } - return nil, rawfile.TranslateErrno(errno) + return nil, stopped, rawfile.TranslateErrno(errno) + } + if stopped { + return nil, true, nil } if hdr.tpStatus()&tpStatusCopy != 0 { // This frame is truncated so skip it after flipping the @@ -157,14 +159,14 @@ func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, tcpip.Error) { // Release packet to kernel. hdr.setTPStatus(tpStatusKernel) d.ringOffset = (d.ringOffset + 1) % tpFrameNR - return pkt, nil + return pkt, false, nil } // dispatch reads packets from an mmaped ring buffer and dispatches them to the // network stack. func (d *packetMMapDispatcher) dispatch() (bool, tcpip.Error) { - pkt, err := d.readMMappedPacket() - if err != nil { + pkt, stopped, err := d.readMMappedPacket() + if err != nil || stopped { return false, err } var ( diff --git a/pkg/tcpip/link/fdbased/mmap_stub.go b/pkg/tcpip/link/fdbased/mmap_stub.go index 67be52d67..9d8679502 100644 --- a/pkg/tcpip/link/fdbased/mmap_stub.go +++ b/pkg/tcpip/link/fdbased/mmap_stub.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !linux || (!amd64 && !arm64) // +build !linux !amd64,!arm64 package fdbased diff --git a/pkg/tcpip/link/fdbased/mmap_unsafe.go b/pkg/tcpip/link/fdbased/mmap_unsafe.go index 1293f68a2..5b786169a 100644 --- a/pkg/tcpip/link/fdbased/mmap_unsafe.go +++ b/pkg/tcpip/link/fdbased/mmap_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build (linux && amd64) || (linux && arm64) // +build linux,amd64 linux,arm64 package fdbased @@ -46,9 +47,14 @@ func (t tPacketHdr) setTPStatus(status uint32) { } func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + stopFd, err := newStopFd() + if err != nil { + return nil, err + } d := &packetMMapDispatcher{ - fd: fd, - e: e, + stopFd: stopFd, + fd: fd, + e: e, } pageSize := unix.Getpagesize() if tpBlockSize%pageSize != 0 { diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index 4b7ef3aac..fab34c5fa 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package fdbased import ( + "fmt" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -113,9 +116,36 @@ func (b *iovecBuffer) pullViews(n int) buffer.VectorisedView { return buffer.NewVectorisedView(n, views) } +// stopFd is an eventfd used to signal the stop of a dispatcher. +type stopFd struct { + efd int +} + +func newStopFd() (stopFd, error) { + efd, err := unix.Eventfd(0, unix.EFD_NONBLOCK) + if err != nil { + return stopFd{efd: -1}, fmt.Errorf("failed to create eventfd: %w", err) + } + return stopFd{efd: efd}, nil +} + +// stop writes to the eventfd and notifies the dispatcher to stop. It does not +// block. +func (s *stopFd) stop() { + increment := []byte{1, 0, 0, 0, 0, 0, 0, 0} + if n, err := unix.Write(s.efd, increment); n != len(increment) || err != nil { + // There are two possible errors documented in eventfd(2) for writing: + // 1. We are writing 8 bytes and not 0xffffffffffffff, thus no EINVAL. + // 2. stop is only supposed to be called once, it can't reach the limit, + // thus no EAGAIN. + panic(fmt.Sprintf("write(efd) = (%d, %s), want (%d, nil)", n, err, len(increment))) + } +} + // readVDispatcher uses readv() system call to read inbound packets and // dispatches them. type readVDispatcher struct { + stopFd // fd is the file descriptor used to send and receive packets. fd int @@ -127,7 +157,15 @@ type readVDispatcher struct { } func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { - d := &readVDispatcher{fd: fd, e: e} + stopFd, err := newStopFd() + if err != nil { + return nil, err + } + d := &readVDispatcher{ + stopFd: stopFd, + fd: fd, + e: e, + } skipsVnetHdr := d.e.gsoKind == stack.HWGSOSupported d.buf = newIovecBuffer(BufConfig, skipsVnetHdr) return d, nil @@ -135,8 +173,8 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { // dispatch reads one packet from the file descriptor and dispatches it. func (d *readVDispatcher) dispatch() (bool, tcpip.Error) { - n, err := rawfile.BlockingReadv(d.fd, d.buf.nextIovecs()) - if n == 0 || err != nil { + n, err := rawfile.BlockingReadvUntilStopped(d.efd, d.fd, d.buf.nextIovecs()) + if n <= 0 || err != nil { return false, err } @@ -183,6 +221,7 @@ func (d *readVDispatcher) dispatch() (bool, tcpip.Error) { // recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and // dispatches them. type recvMMsgDispatcher struct { + stopFd // fd is the file descriptor used to send and receive packets. fd int @@ -206,7 +245,12 @@ const ( ) func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + stopFd, err := newStopFd() + if err != nil { + return nil, err + } d := &recvMMsgDispatcher{ + stopFd: stopFd, fd: fd, e: e, bufs: make([]*iovecBuffer, MaxMsgsPerRecv), @@ -234,8 +278,8 @@ func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) { d.msgHdrs[k].Msg.SetIovlen(iovLen) } - nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs) - if err != nil { + nMsgs, err := rawfile.BlockingRecvMMsgUntilStopped(d.efd, d.fd, d.msgHdrs) + if nMsgs == -1 || err != nil { return false, err } // Process each of received packets. diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 7012d8829..ca1f9c08d 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -76,19 +76,8 @@ func (*endpoint) Wait() {} // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound // packets to the network-layer dispatcher. -func (e *endpoint) WritePacket(_ stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { - // Construct data as the unparsed portion for the loopback packet. - data := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) - - // Because we're immediately turning around and writing the packet back - // to the rx path, we intentionally don't preserve the remote and local - // link addresses from the stack.Route we're passed. - newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: data, - }) - e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, newPkt) - - return nil +func (e *endpoint) WritePacket(_ stack.RouteInfo, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { + return e.WriteRawPacket(pkt) } // WritePackets implements stack.LinkEndpoint.WritePackets. @@ -103,3 +92,19 @@ func (*endpoint) ARPHardwareType() header.ARPHardwareType { func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { } + +// WriteRawPacket implements stack.LinkEndpoint. +func (e *endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error { + // Construct data as the unparsed portion for the loopback packet. + data := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) + + // Because we're immediately turning around and writing the packet back + // to the rx path, we intentionally don't preserve the remote and local + // link addresses from the stack.Route we're passed. + newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: data, + }) + e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, pkt.NetworkProtocolNumber, newPkt) + + return nil +} diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index 3e2a1aa94..844f5959b 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -131,6 +131,11 @@ func (*InjectableEndpoint) ARPHardwareType() header.ARPHardwareType { func (*InjectableEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { } +// WriteRawPacket implements stack.LinkEndpoint. +func (*InjectableEndpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { + return &tcpip.ErrNotSupported{} +} + // NewInjectableEndpoint creates a new multi-endpoint injectable endpoint. func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) *InjectableEndpoint { return &InjectableEndpoint{ diff --git a/pkg/tcpip/link/nested/nested.go b/pkg/tcpip/link/nested/nested.go index 3e816b0c7..83a6c1cc8 100644 --- a/pkg/tcpip/link/nested/nested.go +++ b/pkg/tcpip/link/nested/nested.go @@ -60,16 +60,6 @@ func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco } } -// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. -func (e *Endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { - e.mu.RLock() - d := e.dispatcher - e.mu.RUnlock() - if d != nil { - d.DeliverOutboundPacket(remote, local, protocol, pkt) - } -} - // Attach implements stack.LinkEndpoint. func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() @@ -152,3 +142,8 @@ func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.child.AddHeader(local, remote, protocol, pkt) } + +// WriteRawPacket implements stack.LinkEndpoint. +func (e *Endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error { + return e.child.WriteRawPacket(pkt) +} diff --git a/pkg/tcpip/link/packetsocket/BUILD b/pkg/tcpip/link/packetsocket/BUILD deleted file mode 100644 index 6fff160ce..000000000 --- a/pkg/tcpip/link/packetsocket/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "packetsocket", - srcs = ["endpoint.go"], - visibility = ["//visibility:public"], - deps = [ - "//pkg/tcpip", - "//pkg/tcpip/link/nested", - "//pkg/tcpip/stack", - ], -) diff --git a/pkg/tcpip/link/packetsocket/endpoint.go b/pkg/tcpip/link/packetsocket/endpoint.go deleted file mode 100644 index e01837e2d..000000000 --- a/pkg/tcpip/link/packetsocket/endpoint.go +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package packetsocket provides a link layer endpoint that provides the ability -// to loop outbound packets to any AF_PACKET sockets that may be interested in -// the outgoing packet. -package packetsocket - -import ( - "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/link/nested" - "gvisor.dev/gvisor/pkg/tcpip/stack" -) - -type endpoint struct { - nested.Endpoint -} - -// New creates a new packetsocket LinkEndpoint. -func New(lower stack.LinkEndpoint) stack.LinkEndpoint { - e := &endpoint{} - e.Endpoint.Init(lower, e) - return e -} - -// WritePacket implements stack.LinkEndpoint.WritePacket. -func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { - e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress, r.LocalLinkAddress, protocol, pkt) - return e.Endpoint.WritePacket(r, protocol, pkt) -} - -// WritePackets implements stack.LinkEndpoint.WritePackets. -func (e *endpoint) WritePackets(r stack.RouteInfo, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, tcpip.Error) { - for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress, r.LocalLinkAddress, pkt.NetworkProtocolNumber, pkt) - } - - return e.Endpoint.WritePackets(r, pkts, proto) -} diff --git a/pkg/tcpip/link/pipe/pipe.go b/pkg/tcpip/link/pipe/pipe.go index 5030b6ba1..3ed0aa3fe 100644 --- a/pkg/tcpip/link/pipe/pipe.go +++ b/pkg/tcpip/link/pipe/pipe.go @@ -121,3 +121,6 @@ func (*Endpoint) ARPHardwareType() header.ARPHardwareType { // AddHeader implements stack.LinkEndpoint. func (*Endpoint) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) { } + +// WriteRawPacket implements stack.LinkEndpoint. +func (*Endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go index b1a28491d..b41e3e2fa 100644 --- a/pkg/tcpip/link/qdisc/fifo/endpoint.go +++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go @@ -108,13 +108,15 @@ func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt) } -// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. -func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { - e.dispatcher.DeliverOutboundPacket(remote, local, protocol, pkt) -} - // Attach implements stack.LinkEndpoint.Attach. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { + // nil means the NIC is being removed. + if dispatcher == nil { + e.lower.Attach(nil) + e.Wait() + e.dispatcher = nil + return + } e.dispatcher = dispatcher e.lower.Attach(e) } @@ -221,3 +223,8 @@ func (e *endpoint) ARPHardwareType() header.ARPHardwareType { func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.lower.AddHeader(local, remote, protocol, pkt) } + +// WriteRawPacket implements stack.LinkEndpoint. +func (e *endpoint) WriteRawPacket(pkt *stack.PacketBuffer) tcpip.Error { + return e.lower.WriteRawPacket(pkt) +} diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index 298bad55d..f2c230720 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -27,7 +27,7 @@ TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 MOVQ $0x0, R10 // sigmask parameter which isn't used here MOVQ $0x10f, AX // SYS_PPOLL SYSCALL - CMPQ AX, $0xfffffffffffff001 + CMPQ AX, $0xfffffffffffff002 JLS ok MOVQ $-1, n+24(FP) NEGQ AX diff --git a/pkg/tcpip/link/rawfile/blockingpoll_arm64.s b/pkg/tcpip/link/rawfile/blockingpoll_arm64.s index b62888b93..8807586c7 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_arm64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_arm64.s @@ -27,7 +27,7 @@ TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 MOVD $0x0, R3 // sigmask parameter which isn't used here MOVD $0x49, R8 // SYS_PPOLL SVC - CMP $0xfffffffffffff001, R0 + CMP $0xfffffffffffff002, R0 BLS ok MOVD $-1, R1 MOVD R1, n+24(FP) diff --git a/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go index 2206fe0e6..c1438da21 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux && !amd64 && !arm64 // +build linux,!amd64,!arm64 package rawfile diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go index 5002245a1..0b7b9e3de 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build ((linux && amd64) || (linux && arm64)) && go1.12 // +build linux,amd64 linux,arm64 // +build go1.12 -// +build !go1.18 -// Check go:linkname function signatures when updating Go version. +// //go:linkname directives type-checked by checklinkname. Any other +// non-linkname assumptions outside the Go 1 compatibility guarantee should +// have an accompanied vet check or version guard build tag. package rawfile diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index 9743e70ea..7e21a78d4 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package rawfile diff --git a/pkg/tcpip/link/rawfile/errors_test.go b/pkg/tcpip/link/rawfile/errors_test.go index 8f4bd60da..1b88c309b 100644 --- a/pkg/tcpip/link/rawfile/errors_test.go +++ b/pkg/tcpip/link/rawfile/errors_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package rawfile diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index ba92aedbc..87a0b9a62 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package rawfile contains utilities for using the netstack with raw host @@ -19,12 +20,66 @@ package rawfile import ( + "reflect" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/tcpip" ) +// SizeofIovec is the size of a unix.Iovec in bytes. +const SizeofIovec = unsafe.Sizeof(unix.Iovec{}) + +// MaxIovs is UIO_MAXIOV, the maximum number of iovecs that may be passed to a +// host system call in a single array. +const MaxIovs = 1024 + +// IovecFromBytes returns a unix.Iovec representing bs. +// +// Preconditions: len(bs) > 0. +func IovecFromBytes(bs []byte) unix.Iovec { + iov := unix.Iovec{ + Base: &bs[0], + } + iov.SetLen(len(bs)) + return iov +} + +func bytesFromIovec(iov unix.Iovec) (bs []byte) { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) + sh.Data = uintptr(unsafe.Pointer(iov.Base)) + sh.Len = int(iov.Len) + sh.Cap = int(iov.Len) + return +} + +// AppendIovecFromBytes returns append(iovs, IovecFromBytes(bs)). If len(bs) == +// 0, AppendIovecFromBytes returns iovs without modification. If len(iovs) >= +// max, AppendIovecFromBytes replaces the final iovec in iovs with one that +// also includes the contents of bs. Note that this implies that +// AppendIovecFromBytes is only usable when the returned iovec slice is used as +// the source of a write. +func AppendIovecFromBytes(iovs []unix.Iovec, bs []byte, max int) []unix.Iovec { + if len(bs) == 0 { + return iovs + } + if len(iovs) < max { + return append(iovs, IovecFromBytes(bs)) + } + iovs[len(iovs)-1] = IovecFromBytes(append(bytesFromIovec(iovs[len(iovs)-1]), bs...)) + return iovs +} + +// MMsgHdr represents the mmsg_hdr structure required by recvmmsg() on linux. +type MMsgHdr struct { + Msg unix.Msghdr + Len uint32 + _ [4]byte +} + +// SizeofMMsgHdr is the size of a MMsgHdr in bytes. +const SizeofMMsgHdr = unsafe.Sizeof(MMsgHdr{}) + // GetMTU determines the MTU of a network interface device. func GetMTU(name string) (uint32, error) { fd, err := unix.Socket(unix.AF_UNIX, unix.SOCK_DGRAM, 0) @@ -115,53 +170,77 @@ func BlockingRead(fd int, b []byte) (int, tcpip.Error) { } } -// BlockingReadv reads from a file descriptor that is set up as non-blocking and -// stores the data in a list of iovecs buffers. If no data is available, it will -// block in a poll() syscall until the file descriptor becomes readable. -func BlockingReadv(fd int, iovecs []unix.Iovec) (int, tcpip.Error) { +// BlockingReadvUntilStopped reads from a file descriptor that is set up as +// non-blocking and stores the data in a list of iovecs buffers. If no data is +// available, it will block in a poll() syscall until the file descriptor +// becomes readable or stop is signalled (efd becomes readable). Returns -1 in +// the latter case. +func BlockingReadvUntilStopped(efd int, fd int, iovecs []unix.Iovec) (int, tcpip.Error) { for { n, _, e := unix.RawSyscall(unix.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs))) if e == 0 { return int(n), nil } - - event := PollEvent{ - FD: int32(fd), - Events: 1, // POLLIN + if e != 0 && e != unix.EWOULDBLOCK { + return 0, TranslateErrno(e) + } + stopped, e := BlockingPollUntilStopped(efd, fd, unix.POLLIN) + if stopped { + return -1, nil } - - _, e = BlockingPoll(&event, 1, nil) if e != 0 && e != unix.EINTR { return 0, TranslateErrno(e) } } } -// MMsgHdr represents the mmsg_hdr structure required by recvmmsg() on linux. -type MMsgHdr struct { - Msg unix.Msghdr - Len uint32 - _ [4]byte -} - -// BlockingRecvMMsg reads from a file descriptor that is set up as non-blocking -// and stores the received messages in a slice of MMsgHdr structures. If no data -// is available, it will block in a poll() syscall until the file descriptor -// becomes readable. -func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, tcpip.Error) { +// BlockingRecvMMsgUntilStopped reads from a file descriptor that is set up as +// non-blocking and stores the received messages in a slice of MMsgHdr +// structures. If no data is available, it will block in a poll() syscall until +// the file descriptor becomes readable or stop is signalled (efd becomes +// readable). Returns -1 in the latter case. +func BlockingRecvMMsgUntilStopped(efd int, fd int, msgHdrs []MMsgHdr) (int, tcpip.Error) { for { n, _, e := unix.RawSyscall6(unix.SYS_RECVMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), unix.MSG_DONTWAIT, 0, 0) if e == 0 { return int(n), nil } - event := PollEvent{ - FD: int32(fd), - Events: 1, // POLLIN + if e != 0 && e != unix.EWOULDBLOCK { + return 0, TranslateErrno(e) } - if _, e := BlockingPoll(&event, 1, nil); e != 0 && e != unix.EINTR { + stopped, e := BlockingPollUntilStopped(efd, fd, unix.POLLIN) + if stopped { + return -1, nil + } + if e != 0 && e != unix.EINTR { return 0, TranslateErrno(e) } } } + +// BlockingPollUntilStopped polls for events on fd or until a stop is signalled +// on the event fd efd. Returns true if stopped, i.e., efd has event POLLIN. +func BlockingPollUntilStopped(efd int, fd int, events int16) (bool, unix.Errno) { + pevents := [...]PollEvent{ + { + FD: int32(efd), + Events: unix.POLLIN, + }, + { + FD: int32(fd), + Events: events, + }, + } + _, errno := BlockingPoll(&pevents[0], len(pevents), nil) + if errno != 0 { + return pevents[0].Revents&unix.POLLIN != 0, errno + } + + if pevents[1].Revents&unix.POLLHUP != 0 || pevents[1].Revents&unix.POLLERR != 0 { + errno = unix.ECONNRESET + } + + return pevents[0].Revents&unix.POLLIN != 0, errno +} diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go index 8e6f3e5e3..e882a128c 100644 --- a/pkg/tcpip/link/sharedmem/rx.go +++ b/pkg/tcpip/link/sharedmem/rx.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package sharedmem diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index df9a0b90a..66efe6472 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package sharedmem provides the implemention of data-link layer endpoints @@ -201,6 +202,9 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net eth.Encode(ethHdr) } +// WriteRawPacket implements stack.LinkEndpoint. +func (*endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } + // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 0f72d4e95..d6d953085 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux package sharedmem diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go index c16c19647..d3edede63 100644 --- a/pkg/tcpip/link/sniffer/pcap.go +++ b/pkg/tcpip/link/sniffer/pcap.go @@ -14,7 +14,14 @@ package sniffer -import "time" +import ( + "encoding" + "encoding/binary" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) type pcapHeader struct { // MagicNumber is the file magic number. @@ -39,28 +46,38 @@ type pcapHeader struct { Network uint32 } -const pcapPacketHeaderLen = 16 - -type pcapPacketHeader struct { - // Seconds is the timestamp seconds. - Seconds uint32 - - // Microseconds is the timestamp microseconds. - Microseconds uint32 +var _ encoding.BinaryMarshaler = (*pcapPacket)(nil) - // IncludedLength is the number of octets of packet saved in file. - IncludedLength uint32 - - // OriginalLength is the actual length of packet. - OriginalLength uint32 +type pcapPacket struct { + timestamp time.Time + packet *stack.PacketBuffer + maxCaptureLen int } -func newPCAPPacketHeader(incLen, orgLen uint32) pcapPacketHeader { - now := time.Now() - return pcapPacketHeader{ - Seconds: uint32(now.Unix()), - Microseconds: uint32(now.Nanosecond() / 1000), - IncludedLength: incLen, - OriginalLength: orgLen, +func (p *pcapPacket) MarshalBinary() ([]byte, error) { + packetSize := p.packet.Size() + captureLen := p.maxCaptureLen + if packetSize < captureLen { + captureLen = packetSize + } + b := make([]byte, 16+captureLen) + binary.BigEndian.PutUint32(b[0:4], uint32(p.timestamp.Unix())) + binary.BigEndian.PutUint32(b[4:8], uint32(p.timestamp.Nanosecond()/1000)) + binary.BigEndian.PutUint32(b[8:12], uint32(captureLen)) + binary.BigEndian.PutUint32(b[12:16], uint32(packetSize)) + w := tcpip.SliceWriter(b[16:]) + for _, v := range p.packet.Views() { + if captureLen == 0 { + break + } + if len(v) > captureLen { + v = v[:captureLen] + } + n, err := w.Write(v) + if err != nil { + panic(err) + } + captureLen -= n } + return b, nil } diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 2d6a3a833..2afa95af0 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -87,11 +87,7 @@ func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoin } func zoneOffset() (int32, error) { - loc, err := time.LoadLocation("Local") - if err != nil { - return 0, err - } - date := time.Date(0, 0, 0, 0, 0, 0, 0, loc) + date := time.Date(0, 0, 0, 0, 0, 0, 0, time.Local) _, offset := date.Zone() return int32(offset), nil } @@ -117,8 +113,9 @@ func writePCAPHeader(w io.Writer, maxLen uint32) error { // NewWithWriter creates a new sniffer link-layer endpoint. It wraps around // another endpoint and logs packets as they traverse the endpoint. // -// Packets are logged to writer in the pcap format. A sniffer created with this -// function will not emit packets using the standard log package. +// Each packet is written to writer in the pcap format in a single Write call +// without synchronization. A sniffer created with this function will not emit +// packets using the standard log package. // // snapLen is the maximum amount of a packet to be saved. Packets with a length // less than or equal to snapLen will be saved in their entirety. Longer @@ -143,43 +140,23 @@ func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt) } -// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. -func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { - e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt) -} - func (e *endpoint) dumpPacket(dir direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { writer := e.writer if writer == nil && atomic.LoadUint32(&LogPackets) == 1 { logPacket(e.logPrefix, dir, protocol, pkt) } if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 { - totalLength := pkt.Size() - length := totalLength - if max := int(e.maxPCAPLen); length > max { - length = max + packet := pcapPacket{ + timestamp: time.Now(), + packet: pkt, + maxCaptureLen: int(e.maxPCAPLen), } - if err := binary.Write(writer, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil { + b, err := packet.MarshalBinary() + if err != nil { panic(err) } - write := func(b []byte) { - if len(b) > length { - b = b[:length] - } - for len(b) != 0 { - n, err := writer.Write(b) - if err != nil { - panic(err) - } - b = b[n:] - length -= n - } - } - for _, v := range pkt.Views() { - if length == 0 { - break - } - write(v) + if _, err := writer.Write(b); err != nil { + panic(err) } } } diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index 7656cca6a..c3e4c3455 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -26,11 +26,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 36af2a029..fa2131c28 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -18,8 +18,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -88,12 +88,12 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags Flags) error { defer d.mu.Unlock() if d.endpoint != nil { - return syserror.EINVAL + return linuxerr.EINVAL } // Input validation. if flags.TAP && flags.TUN || !flags.TAP && !flags.TUN { - return syserror.EINVAL + return linuxerr.EINVAL } prefix := "tun" @@ -108,7 +108,7 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags Flags) error { endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps) if err != nil { - return syserror.EINVAL + return linuxerr.EINVAL } d.endpoint = endpoint @@ -125,7 +125,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE endpoint, ok := linkEP.(*tunEndpoint) if !ok { // Not a NIC created by tun device. - return nil, syserror.EOPNOTSUPP + return nil, linuxerr.EOPNOTSUPP } if !endpoint.TryIncRef() { // Race detected: NIC got deleted in between. @@ -159,7 +159,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE // Race detected: A NIC has been created in between. continue default: - return nil, syserror.EINVAL + return nil, linuxerr.EINVAL } } } @@ -170,10 +170,10 @@ func (d *Device) Write(data []byte) (int64, error) { endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { - return 0, syserror.EBADFD + return 0, linuxerr.EBADFD } if !endpoint.IsAttached() { - return 0, syserror.EIO + return 0, linuxerr.EIO } dataLen := int64(len(data)) @@ -207,6 +207,15 @@ func (d *Device) Write(data []byte) (int64, error) { protocol = pktInfoHdr.Protocol() case ethHdr != nil: protocol = ethHdr.Type() + case d.flags.TUN: + // TUN interface with IFF_NO_PI enabled, thus + // we need to determine protocol from version field + version := data[0] >> 4 + if version == 4 { + protocol = header.IPv4ProtocolNumber + } else if version == 6 { + protocol = header.IPv6ProtocolNumber + } } // Try to determine remote link address, default zero. @@ -233,13 +242,13 @@ func (d *Device) Read() ([]byte, error) { endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { - return nil, syserror.EBADFD + return nil, linuxerr.EBADFD } for { info, ok := endpoint.Read() if !ok { - return nil, syserror.ErrWouldBlock + return nil, linuxerr.ErrWouldBlock } v, ok := d.encodePkt(&info) @@ -264,13 +273,6 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) { vv.AppendView(buffer.View(hdr)) } - // If the packet does not already have link layer header, and the route - // does not exist, we can't compute it. This is possibly a raw packet, tun - // device doesn't support this at the moment. - if info.Pkt.LinkHeader().View().IsEmpty() && len(info.Route.RemoteLinkAddress) == 0 { - return nil, false - } - // Ethernet header (TAP only). if d.flags.TAP { // Add ethernet header if not provided. diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go index 0591fbd63..db4338e79 100644 --- a/pkg/tcpip/link/tun/tun_unsafe.go +++ b/pkg/tcpip/link/tun/tun_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // Package tun contains methods to open TAP and TUN devices. diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index a95602aa5..116e4defb 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -59,15 +59,6 @@ func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protoco e.dispatchGate.Leave() } -// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. -func (e *Endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { - if !e.dispatchGate.Enter() { - return - } - e.dispatcher.DeliverOutboundPacket(remote, local, protocol, pkt) - e.dispatchGate.Leave() -} - // Attach implements stack.LinkEndpoint.Attach. It saves the dispatcher and // registers with the lower endpoint as its dispatcher so that "e" is called // for inbound packets. @@ -155,3 +146,6 @@ func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.lower.AddHeader(local, remote, protocol, pkt) } + +// WriteRawPacket implements stack.LinkEndpoint. +func (*Endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index a71400ee9..b0e4237bd 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -80,6 +80,11 @@ func (e *countedEndpoint) WritePackets(_ stack.RouteInfo, pkts stack.PacketBuffe return pkts.Len(), nil } +// WriteRawPacket implements stack.LinkEndpoint. +func (*countedEndpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { + return &tcpip.ErrNotSupported{} +} + // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*countedEndpoint) ARPHardwareType() header.ARPHardwareType { panic("unimplemented") diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index 7b1ff44f4..c0179104a 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -23,8 +23,10 @@ go_test( "//pkg/tcpip/stack", "//pkg/tcpip/testutil", "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/raw", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", + "//pkg/waiter", "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 6515c31e5..e08243547 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -272,7 +272,6 @@ type protocol struct { func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } func (p *protocol) MinimumPacketSize() int { return header.ARPSize } -func (p *protocol) DefaultPrefixLen() int { return 0 } func (*protocol) ParseAddresses(buffer.View) (src, dst tcpip.Address) { return "", "" diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 5fcbfeaa2..061cc35ae 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -153,8 +153,12 @@ func makeTestContext(t *testing.T, eventDepth int, packetDepth int) testContext t.Fatalf("CreateNIC failed: %s", err) } - if err := tc.s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil { - t.Fatalf("AddAddress for ipv4 failed: %s", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: stackAddr.WithPrefix(), + } + if err := tc.s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } tc.s.SetRouteTable([]tcpip.Route{{ @@ -569,8 +573,12 @@ func TestLinkAddressRequest(t *testing.T) { } if len(test.nicAddr) != 0 { - if err := s.AddAddress(nicID, ipv4.ProtocolNumber, test.nicAddr); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, test.nicAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: test.nicAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } } diff --git a/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go b/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go index 0b51563cd..1261ad414 100644 --- a/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go +++ b/pkg/tcpip/network/internal/ip/generic_multicast_protocol_test.go @@ -126,7 +126,7 @@ func (m *mockMulticastGroupProtocol) sendQueuedReports() { // Precondition: m.mu must be read locked. func (m *mockMulticastGroupProtocol) Enabled() bool { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatal("got write lock, expected to not take the lock; generic multicast protocol must take the read or write lock before calling Enabled") } @@ -138,11 +138,11 @@ func (m *mockMulticastGroupProtocol) Enabled() bool { // Precondition: m.mu must be locked. func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) (bool, tcpip.Error) { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) } if m.mu.TryRLock() { - m.mu.RUnlock() + m.mu.RUnlock() // +checklocksforce: TryLock. m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending report for %s", groupAddress) } @@ -155,11 +155,11 @@ func (m *mockMulticastGroupProtocol) SendReport(groupAddress tcpip.Address) (boo // Precondition: m.mu must be locked. func (m *mockMulticastGroupProtocol) SendLeave(groupAddress tcpip.Address) tcpip.Error { if m.mu.TryLock() { - m.mu.Unlock() + m.mu.Unlock() // +checklocksforce: TryLock. m.t.Fatalf("got write lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) } if m.mu.TryRLock() { - m.mu.RUnlock() + m.mu.RUnlock() // +checklocksforce: TryLock. m.t.Fatalf("got read lock, expected to not take the lock; generic multicast protocol must take the write lock before sending leave for %s", groupAddress) } diff --git a/pkg/tcpip/network/internal/testutil/testutil.go b/pkg/tcpip/network/internal/testutil/testutil.go index 605e9ef8d..4d4d98caf 100644 --- a/pkg/tcpip/network/internal/testutil/testutil.go +++ b/pkg/tcpip/network/internal/testutil/testutil.go @@ -101,6 +101,11 @@ func (*MockLinkEndpoint) ARPHardwareType() header.ARPHardwareType { return heade func (*MockLinkEndpoint) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) { } +// WriteRawPacket implements stack.LinkEndpoint. +func (*MockLinkEndpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { + return &tcpip.ErrNotSupported{} +} + // MakeRandPkt generates a randomized packet. transportHeaderLength indicates // how many random bytes will be copied in the Transport Header. // extraHeaderReserveLength indicates how much extra space will be reserved for diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index bd63e0289..87f650661 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -15,6 +15,7 @@ package ip_test import ( + "bytes" "fmt" "strings" "testing" @@ -32,8 +33,10 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/testutil" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/waiter" ) const nicID = 1 @@ -88,6 +91,7 @@ type testObject struct { dataCalls int controlCalls int + rawCalls int } // checkValues verifies that the transport protocol, data contents, src & dst @@ -148,6 +152,10 @@ func (t *testObject) DeliverTransportError(local, remote tcpip.Address, net tcpi t.controlCalls++ } +func (t *testObject) DeliverRawPacket(tcpip.TransportProtocolNumber, *stack.PacketBuffer) { + t.rawCalls++ +} + // Attach is only implemented to satisfy the LinkEndpoint interface. func (*testObject) Attach(stack.NetworkDispatcher) {} @@ -225,7 +233,13 @@ func buildIPv4Route(local, remote tcpip.Address) (*stack.Route, tcpip.Error) { TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, }) s.CreateNIC(nicID, loopback.New()) - s.AddAddress(nicID, ipv4.ProtocolNumber, local) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: local.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + return nil, err + } s.SetRouteTable([]tcpip.Route{{ Destination: header.IPv4EmptySubnet, Gateway: ipv4Gateway, @@ -241,7 +255,13 @@ func buildIPv6Route(local, remote tcpip.Address) (*stack.Route, tcpip.Error) { TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, }) s.CreateNIC(nicID, loopback.New()) - s.AddAddress(nicID, ipv6.ProtocolNumber, local) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: local.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + return nil, err + } s.SetRouteTable([]tcpip.Route{{ Destination: header.IPv6EmptySubnet, Gateway: ipv6Gateway, @@ -264,13 +284,13 @@ func buildDummyStackWithLinkEndpoint(t *testing.T, mtu uint32) (*stack.Stack, *c } v4Addr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: localIPv4AddrWithPrefix} - if err := s.AddProtocolAddress(nicID, v4Addr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v) = %s", nicID, v4Addr, err) + if err := s.AddProtocolAddress(nicID, v4Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID, v4Addr, err) } v6Addr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: localIPv6AddrWithPrefix} - if err := s.AddProtocolAddress(nicID, v6Addr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v) = %s", nicID, v6Addr, err) + if err := s.AddProtocolAddress(nicID, v6Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID, v6Addr, err) } return s, e @@ -705,8 +725,8 @@ func TestReceive(t *testing.T) { if !ok { t.Fatalf("expected network endpoint with number = %d to implement stack.AddressableEndpoint", test.protoNum) } - if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(test.epAddr, stack.CanBePrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */); err != nil { - t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, CanBePrimaryEndpoint, AddressConfigStatic, false): %s", test.epAddr, err) + if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(test.epAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, {}): %s", test.epAddr, err) } else { ep.DecRef() } @@ -717,7 +737,10 @@ func TestReceive(t *testing.T) { } test.handlePacket(t, ep, &nic) if nic.testObject.dataCalls != 1 { - t.Errorf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls) + t.Errorf("Bad number of data calls: got %d, want 1", nic.testObject.dataCalls) + } + if nic.testObject.rawCalls != 1 { + t.Errorf("Bad number of raw calls: got %d, want 1", nic.testObject.rawCalls) } if got := stat.Value(); got != 1 { t.Errorf("got s.Stats().IP.PacketsReceived.Value() = %d, want = 1", got) @@ -874,8 +897,8 @@ func TestIPv4ReceiveControl(t *testing.T) { t.Fatal("expected IPv4 network endpoint to implement stack.AddressableEndpoint") } addr := localIPv4Addr.WithPrefix() - if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.CanBePrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */); err != nil { - t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, CanBePrimaryEndpoint, AddressConfigStatic, false): %s", addr, err) + if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.AddressProperties{}); err != nil { + t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, {}): %s", addr, err) } else { ep.DecRef() } @@ -960,15 +983,18 @@ func TestIPv4FragmentationReceive(t *testing.T) { t.Fatal("expected IPv4 network endpoint to implement stack.AddressableEndpoint") } addr := localIPv4Addr.WithPrefix() - if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.CanBePrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */); err != nil { - t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, CanBePrimaryEndpoint, AddressConfigStatic, false): %s", addr, err) + if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.AddressProperties{}); err != nil { + t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, {}): %s", addr, err) } else { ep.DecRef() } ep.HandlePacket(pkt) if nic.testObject.dataCalls != 0 { - t.Fatalf("Bad number of data calls: got %x, want 0", nic.testObject.dataCalls) + t.Fatalf("Bad number of data calls: got %d, want 0", nic.testObject.dataCalls) + } + if nic.testObject.rawCalls != 0 { + t.Errorf("Bad number of raw calls: got %d, want 0", nic.testObject.rawCalls) } // Send second segment. @@ -977,7 +1003,10 @@ func TestIPv4FragmentationReceive(t *testing.T) { }) ep.HandlePacket(pkt) if nic.testObject.dataCalls != 1 { - t.Fatalf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls) + t.Fatalf("Bad number of data calls: got %d, want 1", nic.testObject.dataCalls) + } + if nic.testObject.rawCalls != 1 { + t.Errorf("Bad number of raw calls: got %d, want 1", nic.testObject.rawCalls) } } @@ -1220,8 +1249,8 @@ func TestIPv6ReceiveControl(t *testing.T) { t.Fatal("expected IPv6 network endpoint to implement stack.AddressableEndpoint") } addr := localIPv6Addr.WithPrefix() - if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.CanBePrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */); err != nil { - t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, CanBePrimaryEndpoint, AddressConfigStatic, false): %s", addr, err) + if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.AddressProperties{}); err != nil { + t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, {}): %s", addr, err) } else { ep.DecRef() } @@ -1287,7 +1316,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name string protoFactory stack.NetworkProtocolFactory protoNum tcpip.NetworkProtocolNumber - nicAddr tcpip.Address + nicAddr tcpip.AddressWithPrefix remoteAddr tcpip.Address pktGen func(*testing.T, tcpip.Address) buffer.VectorisedView checker func(*testing.T, *stack.PacketBuffer, tcpip.Address) @@ -1297,7 +1326,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv4", protoFactory: ipv4.NewProtocol, protoNum: ipv4.ProtocolNumber, - nicAddr: localIPv4Addr, + nicAddr: localIPv4AddrWithPrefix, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { totalLen := header.IPv4MinimumSize + len(data) @@ -1310,7 +1339,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { Protocol: transportProto, TTL: ipv4.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv4Addr, }) return hdr.View().ToVectorisedView() }, @@ -1338,7 +1367,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv4 with IHL too small", protoFactory: ipv4.NewProtocol, protoNum: ipv4.ProtocolNumber, - nicAddr: localIPv4Addr, + nicAddr: localIPv4AddrWithPrefix, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { totalLen := header.IPv4MinimumSize + len(data) @@ -1351,7 +1380,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { Protocol: transportProto, TTL: ipv4.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv4Addr, }) ip.SetHeaderLength(header.IPv4MinimumSize - 1) return hdr.View().ToVectorisedView() @@ -1362,7 +1391,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv4 too small", protoFactory: ipv4.NewProtocol, protoNum: ipv4.ProtocolNumber, - nicAddr: localIPv4Addr, + nicAddr: localIPv4AddrWithPrefix, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ip := header.IPv4(make([]byte, header.IPv4MinimumSize)) @@ -1370,7 +1399,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { Protocol: transportProto, TTL: ipv4.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv4Addr, }) return buffer.View(ip[:len(ip)-1]).ToVectorisedView() }, @@ -1380,7 +1409,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv4 minimum size", protoFactory: ipv4.NewProtocol, protoNum: ipv4.ProtocolNumber, - nicAddr: localIPv4Addr, + nicAddr: localIPv4AddrWithPrefix, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ip := header.IPv4(make([]byte, header.IPv4MinimumSize)) @@ -1388,7 +1417,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { Protocol: transportProto, TTL: ipv4.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv4Addr, }) return buffer.View(ip).ToVectorisedView() }, @@ -1416,7 +1445,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv4 with options", protoFactory: ipv4.NewProtocol, protoNum: ipv4.ProtocolNumber, - nicAddr: localIPv4Addr, + nicAddr: localIPv4AddrWithPrefix, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ipHdrLen := int(header.IPv4MinimumSize + ipv4Options.Length()) @@ -1430,7 +1459,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { Protocol: transportProto, TTL: ipv4.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv4Addr, Options: ipv4Options, }) return hdr.View().ToVectorisedView() @@ -1461,7 +1490,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv4 with options and data across views", protoFactory: ipv4.NewProtocol, protoNum: ipv4.ProtocolNumber, - nicAddr: localIPv4Addr, + nicAddr: localIPv4AddrWithPrefix, remoteAddr: remoteIPv4Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ip := header.IPv4(make([]byte, header.IPv4MinimumSize+ipv4Options.Length())) @@ -1469,7 +1498,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { Protocol: transportProto, TTL: ipv4.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv4Addr, Options: ipv4Options, }) vv := buffer.View(ip).ToVectorisedView() @@ -1502,7 +1531,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv6", protoFactory: ipv6.NewProtocol, protoNum: ipv6.ProtocolNumber, - nicAddr: localIPv6Addr, + nicAddr: localIPv6AddrWithPrefix, remoteAddr: remoteIPv6Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { totalLen := header.IPv6MinimumSize + len(data) @@ -1515,7 +1544,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { TransportProtocol: transportProto, HopLimit: ipv6.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv6Addr, }) return hdr.View().ToVectorisedView() }, @@ -1542,7 +1571,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv6 with extension header", protoFactory: ipv6.NewProtocol, protoNum: ipv6.ProtocolNumber, - nicAddr: localIPv6Addr, + nicAddr: localIPv6AddrWithPrefix, remoteAddr: remoteIPv6Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { totalLen := header.IPv6MinimumSize + len(ipv6FragmentExtHdr) + len(data) @@ -1560,7 +1589,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { TransportProtocol: tcpip.TransportProtocolNumber(header.IPv6FragmentExtHdrIdentifier), HopLimit: ipv6.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv6Addr, }) return hdr.View().ToVectorisedView() }, @@ -1587,7 +1616,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv6 minimum size", protoFactory: ipv6.NewProtocol, protoNum: ipv6.ProtocolNumber, - nicAddr: localIPv6Addr, + nicAddr: localIPv6AddrWithPrefix, remoteAddr: remoteIPv6Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ip := header.IPv6(make([]byte, header.IPv6MinimumSize)) @@ -1595,7 +1624,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { TransportProtocol: transportProto, HopLimit: ipv6.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv6Addr, }) return buffer.View(ip).ToVectorisedView() }, @@ -1622,7 +1651,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { name: "IPv6 too small", protoFactory: ipv6.NewProtocol, protoNum: ipv6.ProtocolNumber, - nicAddr: localIPv6Addr, + nicAddr: localIPv6AddrWithPrefix, remoteAddr: remoteIPv6Addr, pktGen: func(t *testing.T, src tcpip.Address) buffer.VectorisedView { ip := header.IPv6(make([]byte, header.IPv6MinimumSize)) @@ -1630,7 +1659,7 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { TransportProtocol: transportProto, HopLimit: ipv6.DefaultTTL, SrcAddr: src, - DstAddr: header.IPv4Any, + DstAddr: remoteIPv4Addr, }) return buffer.View(ip[:len(ip)-1]).ToVectorisedView() }, @@ -1646,11 +1675,11 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { }{ { name: "unspecified source", - srcAddr: tcpip.Address(strings.Repeat("\x00", len(test.nicAddr))), + srcAddr: tcpip.Address(strings.Repeat("\x00", len(test.nicAddr.Address))), }, { name: "random source", - srcAddr: tcpip.Address(strings.Repeat("\xab", len(test.nicAddr))), + srcAddr: tcpip.Address(strings.Repeat("\xab", len(test.nicAddr.Address))), }, } @@ -1663,15 +1692,19 @@ func TestWriteHeaderIncludedPacket(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(nicID, test.protoNum, test.nicAddr); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, test.protoNum, test.nicAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: test.protoNum, + AddressWithPrefix: test.nicAddr, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{{Destination: test.remoteAddr.WithPrefix().Subnet(), NIC: nicID}}) - r, err := s.FindRoute(nicID, test.nicAddr, test.remoteAddr, test.protoNum, false /* multicastLoop */) + r, err := s.FindRoute(nicID, test.nicAddr.Address, test.remoteAddr, test.protoNum, false /* multicastLoop */) if err != nil { - t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, test.remoteAddr, test.nicAddr, test.protoNum, err) + t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, test.remoteAddr, test.nicAddr.Address, test.protoNum, err) } defer r.Release() @@ -2018,3 +2051,97 @@ func TestJoinLeaveAllRoutersGroup(t *testing.T) { }) } } + +func TestSetNICIDBeforeDeliveringToRawEndpoint(t *testing.T) { + const nicID = 1 + + tests := []struct { + name string + proto tcpip.NetworkProtocolNumber + addr tcpip.AddressWithPrefix + payloadOffset int + }{ + { + name: "IPv4", + proto: header.IPv4ProtocolNumber, + addr: localIPv4AddrWithPrefix, + payloadOffset: header.IPv4MinimumSize, + }, + { + name: "IPv6", + proto: header.IPv6ProtocolNumber, + addr: localIPv6AddrWithPrefix, + payloadOffset: 0, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ + ipv4.NewProtocol, + ipv6.NewProtocol, + }, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + RawFactory: raw.EndpointFactory{}, + }) + if err := s.CreateNIC(nicID, loopback.New()); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + protocolAddr := tcpip.ProtocolAddress{ + Protocol: test.proto, + AddressWithPrefix: test.addr, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) + } + + s.SetRouteTable([]tcpip.Route{ + { + Destination: test.addr.Subnet(), + NIC: nicID, + }, + }) + + var wq waiter.Queue + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.ReadableEvents) + ep, err := s.NewRawEndpoint(udp.ProtocolNumber, test.proto, &wq, true /* associated */) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.proto, err) + } + defer ep.Close() + + writeOpts := tcpip.WriteOptions{ + To: &tcpip.FullAddress{ + Addr: test.addr.Address, + }, + } + data := []byte{1, 2, 3, 4} + var r bytes.Reader + r.Reset(data) + if n, err := ep.Write(&r, writeOpts); err != nil { + t.Fatalf("ep.Write(_, _): %s", err) + } else if want := int64(len(data)); n != want { + t.Fatalf("got ep.Write(_, _) = (%d, nil), want = (%d, nil)", n, want) + } + + // Wait for the endpoint to become readable. + <-ch + + var w bytes.Buffer + rr, err := ep.Read(&w, tcpip.ReadOptions{ + NeedRemoteAddr: true, + }) + if err != nil { + t.Fatalf("ep.Read(...): %s", err) + } + if diff := cmp.Diff(data, w.Bytes()[test.payloadOffset:]); diff != "" { + t.Errorf("payload mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tcpip.FullAddress{Addr: test.addr.Address, NIC: nicID}, rr.RemoteAddr); diff != "" { + t.Errorf("remote addr mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index c90974693..2257f728e 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -39,7 +39,6 @@ go_test( "//pkg/tcpip/faketime", "//pkg/tcpip/header", "//pkg/tcpip/link/channel", - "//pkg/tcpip/link/loopback", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", "//pkg/tcpip/network/internal/testutil", diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index 5f6b0c6af..2aa38eb98 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -173,9 +173,8 @@ func (e *endpoint) handleControl(errInfo stack.TransportError, pkt *stack.Packet func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { received := e.stats.icmp.packetsReceived - // TODO(gvisor.dev/issue/170): ICMP packets don't have their - // TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a - // full explanation. + // ICMP packets don't have their TransportHeader fields set. See + // icmp/protocol.go:protocol.Parse for a full explanation. v, ok := pkt.Data().PullUp(header.ICMPv4MinimumSize) if !ok { received.invalid.Increment() diff --git a/pkg/tcpip/network/ipv4/igmp_test.go b/pkg/tcpip/network/ipv4/igmp_test.go index 4bd6f462e..c6576fcbc 100644 --- a/pkg/tcpip/network/ipv4/igmp_test.go +++ b/pkg/tcpip/network/ipv4/igmp_test.go @@ -120,9 +120,12 @@ func createAndInjectIGMPPacket(e *channel.Endpoint, igmpType header.IGMPType, ma // cycles. func TestIGMPV1Present(t *testing.T) { e, s, clock := createStack(t, true) - addr := tcpip.AddressWithPrefix{Address: stackAddr, PrefixLen: defaultPrefixLength} - if err := s.AddAddressWithPrefix(nicID, ipv4.ProtocolNumber, addr); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, addr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{Address: stackAddr, PrefixLen: defaultPrefixLength}, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } if err := s.JoinGroup(ipv4.ProtocolNumber, nicID, multicastAddr); err != nil { @@ -215,8 +218,15 @@ func TestSendQueuedIGMPReports(t *testing.T) { // The initial set of IGMP reports that were queued should be sent once an // address is assigned. - if err := s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, stackAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: stackAddr, + PrefixLen: defaultPrefixLength, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } if got := reportStat.Value(); got != 1 { t.Errorf("got reportStat.Value() = %d, want = 1", got) @@ -350,8 +360,12 @@ func TestIGMPPacketValidation(t *testing.T) { t.Run(test.name, func(t *testing.T) { e, s, _ := createStack(t, true) for _, address := range test.stackAddresses { - if err := s.AddAddressWithPrefix(nicID, ipv4.ProtocolNumber, address); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, address, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: address, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } } stats := s.Stats() diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index c99297a51..aef789b4c 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -240,7 +240,7 @@ func (e *endpoint) Enable() tcpip.Error { } // Create an endpoint to receive broadcast packets on this interface. - ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */) + ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.AddressProperties{PEB: stack.NeverPrimaryEndpoint}) if err != nil { return err } @@ -429,9 +429,9 @@ func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, // based on destination address and do not send the packet to link // layer. // - // TODO(gvisor.dev/issue/170): We should do this for every - // packet, rather than only NATted packets, but removing this check - // short circuits broadcasts before they are sent out to other hosts. + // We should do this for every packet, rather than only NATted packets, but + // removing this check short circuits broadcasts before they are sent out to + // other hosts. if pkt.NatDone { netHeader := header.IPv4(pkt.NetworkHeader().View()) if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil { @@ -614,10 +614,6 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu ipH.SetSourceAddress(r.LocalAddress()) } - // Set the destination. If the packet already included a destination, it will - // be part of the route anyways. - ipH.SetDestinationAddress(r.RemoteAddress()) - // Set the packet ID when zero. if ipH.ID() == 0 { // RFC 6864 section 4.3 mandates uniqueness of ID values for @@ -861,6 +857,14 @@ func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) { pkt.NICID = e.nic.ID() + + // Raw socket packets are delivered based solely on the transport protocol + // number. We only require that the packet be valid IPv4, and that they not + // be fragmented. + if !h.More() && h.FragmentOffset() == 0 { + e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) + } + stats := e.stats stats.ip.ValidPacketsReceived.Increment() @@ -995,6 +999,9 @@ func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer, // to do it here. h.SetTotalLength(uint16(pkt.Data().Size() + len(h))) h.SetFlagsFragmentOffset(0, 0) + + // Now that the packet is reassembled, it can be sent to raw sockets. + e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) } stats.ip.PacketsDelivered.Increment() @@ -1068,11 +1075,11 @@ func (e *endpoint) Close() { } // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint. -func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { +func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() - ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated) + ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, properties) if err == nil { e.mu.igmp.sendQueuedReports() } @@ -1219,11 +1226,6 @@ func (p *protocol) MinimumPacketSize() int { return header.IPv4MinimumSize } -// DefaultPrefixLen returns the IPv4 default prefix length. -func (p *protocol) DefaultPrefixLen() int { - return header.IPv4AddressSize * 8 -} - // ParseAddresses implements stack.NetworkProtocol. func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { h := header.IPv4(v) diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 4a4448cf9..e7b5b3ea2 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -32,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/faketime" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" - "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" iptestutil "gvisor.dev/gvisor/pkg/tcpip/network/internal/testutil" @@ -102,8 +101,12 @@ func TestExcludeBroadcast(t *testing.T) { defer ep.Close() // Add a valid primary endpoint address, now we can connect. - if err := s.AddAddress(1, ipv4.ProtocolNumber, "\x0a\x00\x00\x02"); err != nil { - t.Fatalf("AddAddress failed: %v", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.Address("\x0a\x00\x00\x02").WithPrefix(), + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } if err := ep.Connect(randomAddr); err != nil { t.Errorf("Connect failed: %v", err) @@ -357,8 +360,8 @@ func TestForwarding(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", incomingNICID, err) } incomingIPv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: incomingIPv4Addr} - if err := s.AddProtocolAddress(incomingNICID, incomingIPv4ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", incomingNICID, incomingIPv4ProtoAddr, err) + if err := s.AddProtocolAddress(incomingNICID, incomingIPv4ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", incomingNICID, incomingIPv4ProtoAddr, err) } expectedEmittedPacketCount := 1 @@ -370,8 +373,8 @@ func TestForwarding(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", outgoingNICID, err) } outgoingIPv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: outgoingIPv4Addr} - if err := s.AddProtocolAddress(outgoingNICID, outgoingIPv4ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", outgoingNICID, outgoingIPv4ProtoAddr, err) + if err := s.AddProtocolAddress(outgoingNICID, outgoingIPv4ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", outgoingNICID, outgoingIPv4ProtoAddr, err) } s.SetRouteTable([]tcpip.Route{ @@ -1185,8 +1188,8 @@ func TestIPv4Sanity(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } ipv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: ipv4Addr} - if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, ipv4ProtoAddr, err) + if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, ipv4ProtoAddr, err) } // Default routes for IPv4 so ICMP can find a route to the remote @@ -1746,8 +1749,8 @@ func TestInvalidFragments(t *testing.T) { const ( nicID = 1 linkAddr = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") - addr1 = "\x0a\x00\x00\x01" - addr2 = "\x0a\x00\x00\x02" + addr1 = tcpip.Address("\x0a\x00\x00\x01") + addr2 = tcpip.Address("\x0a\x00\x00\x02") tos = 0 ident = 1 ttl = 48 @@ -2013,8 +2016,12 @@ func TestInvalidFragments(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ipv4.ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } for _, f := range test.fragments { @@ -2062,8 +2069,8 @@ func TestFragmentReassemblyTimeout(t *testing.T) { const ( nicID = 1 linkAddr = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") - addr1 = "\x0a\x00\x00\x01" - addr2 = "\x0a\x00\x00\x02" + addr1 = tcpip.Address("\x0a\x00\x00\x01") + addr2 = tcpip.Address("\x0a\x00\x00\x02") tos = 0 ident = 1 ttl = 48 @@ -2238,8 +2245,12 @@ func TestFragmentReassemblyTimeout(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ipv4.ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{{ Destination: header.IPv4EmptySubnet, @@ -2309,9 +2320,9 @@ func TestReceiveFragments(t *testing.T) { const ( nicID = 1 - addr1 = "\x0c\xa8\x00\x01" // 192.168.0.1 - addr2 = "\x0c\xa8\x00\x02" // 192.168.0.2 - addr3 = "\x0c\xa8\x00\x03" // 192.168.0.3 + addr1 = tcpip.Address("\x0c\xa8\x00\x01") // 192.168.0.1 + addr2 = tcpip.Address("\x0c\xa8\x00\x02") // 192.168.0.2 + addr3 = tcpip.Address("\x0c\xa8\x00\x03") // 192.168.0.3 ) // Build and return a UDP header containing payload. @@ -2704,8 +2715,12 @@ func TestReceiveFragments(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } wq := waiter.Queue{} @@ -2986,11 +3001,15 @@ func buildRoute(t *testing.T, ep stack.LinkEndpoint) *stack.Route { t.Fatalf("CreateNIC(1, _) failed: %s", err) } const ( - src = "\x10\x00\x00\x01" - dst = "\x10\x00\x00\x02" + src = tcpip.Address("\x10\x00\x00\x01") + dst = tcpip.Address("\x10\x00\x00\x02") ) - if err := s.AddAddress(1, ipv4.ProtocolNumber, src); err != nil { - t.Fatalf("AddAddress(1, %d, %s) failed: %s", ipv4.ProtocolNumber, src, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: src.WithPrefix(), + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } { mask := tcpip.AddressMask(header.IPv4Broadcast) @@ -3162,8 +3181,8 @@ func TestPacketQueuing(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddProtocolAddress(nicID, host1IPv4Addr); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, host1IPv4Addr, err) + if err := s.AddProtocolAddress(nicID, host1IPv4Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID, host1IPv4Addr, err) } s.SetRouteTable([]tcpip.Route{ @@ -3286,8 +3305,12 @@ func TestCloseLocking(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) } - if err := s.AddAddress(nicID1, ipv4.ProtocolNumber, src); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) failed: %s", nicID1, ipv4.ProtocolNumber, src, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: src.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID1, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{{ @@ -3339,7 +3362,7 @@ func TestCloseLocking(t *testing.T) { defer wg.Done() for i := 0; i < iterations; i++ { - if err := s.CreateNIC(nicID2, loopback.New()); err != nil { + if err := s.CreateNIC(nicID2, stack.LinkEndpoint(channel.New(0, defaultMTU, ""))); err != nil { t.Errorf("CreateNIC(%d, _): %s", nicID2, err) return } diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 23fc94303..94caaae6c 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -285,8 +285,8 @@ func isMLDValid(pkt *stack.PacketBuffer, iph header.IPv6, routerAlert *header.IP func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool, routerAlert *header.IPv6RouterAlertOption) { sent := e.stats.icmp.packetsSent received := e.stats.icmp.packetsReceived - // TODO(gvisor.dev/issue/170): ICMP packets don't have their TransportHeader - // fields set. See icmp/protocol.go:protocol.Parse for a full explanation. + // ICMP packets don't have their TransportHeader fields set. See + // icmp/protocol.go:protocol.Parse for a full explanation. v, ok := pkt.Data().PullUp(header.ICMPv6HeaderSize) if !ok { received.invalid.Increment() diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index c2e9544c1..3b4c235fa 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -90,6 +90,10 @@ func (*stubDispatcher) DeliverTransportPacket(tcpip.TransportProtocolNumber, *st return stack.TransportPacketHandled } +func (*stubDispatcher) DeliverRawPacket(tcpip.TransportProtocolNumber, *stack.PacketBuffer) { + // No-op. +} + var _ stack.NetworkInterface = (*testInterface)(nil) type testInterface struct { @@ -221,8 +225,8 @@ func TestICMPCounts(t *testing.T) { t.Fatalf("expected network endpoint to implement stack.AddressableEndpoint") } addr := lladdr0.WithPrefix() - if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.CanBePrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */); err != nil { - t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, CanBePrimaryEndpoint, AddressConfigStatic, false): %s", addr, err) + if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.AddressProperties{}); err != nil { + t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, {}): %s", addr, err) } else { ep.DecRef() } @@ -403,8 +407,12 @@ func newTestContext(t *testing.T) *testContext { if err := c.s0.CreateNIC(nicID, wrappedEP0); err != nil { t.Fatalf("CreateNIC s0: %v", err) } - if err := c.s0.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress lladdr0: %v", err) + llProtocolAddr0 := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := c.s0.AddProtocolAddress(nicID, llProtocolAddr0, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, llProtocolAddr0, err) } c.linkEP1 = channel.New(defaultChannelSize, defaultMTU, linkAddr1) @@ -412,8 +420,12 @@ func newTestContext(t *testing.T) *testContext { if err := c.s1.CreateNIC(nicID, wrappedEP1); err != nil { t.Fatalf("CreateNIC failed: %v", err) } - if err := c.s1.AddAddress(nicID, ProtocolNumber, lladdr1); err != nil { - t.Fatalf("AddAddress lladdr1: %v", err) + llProtocolAddr1 := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr1.WithPrefix(), + } + if err := c.s1.AddProtocolAddress(nicID, llProtocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, llProtocolAddr1, err) } subnet0, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) @@ -686,8 +698,12 @@ func TestICMPChecksumValidationSimple(t *testing.T) { t.Fatalf("CreateNIC(_, _) = %s", err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } { subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) @@ -879,8 +895,12 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) { t.Fatalf("CreateNIC(_, _) = %s", err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } { subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) @@ -1061,8 +1081,12 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } { subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) @@ -1236,8 +1260,12 @@ func TestLinkAddressRequest(t *testing.T) { } if len(test.nicAddr) != 0 { - if err := s.AddAddress(nicID, ProtocolNumber, test.nicAddr); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ProtocolNumber, test.nicAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: test.nicAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } } @@ -1411,8 +1439,8 @@ func TestPacketQueing(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddProtocolAddress(nicID, host1IPv6Addr); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, host1IPv6Addr, err) + if err := s.AddProtocolAddress(nicID, host1IPv6Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID, host1IPv6Addr, err) } s.SetRouteTable([]tcpip.Route{ @@ -1665,8 +1693,12 @@ func TestCallsToNeighborCache(t *testing.T) { if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil { t.Fatalf("CreateNIC(_, _) = %s", err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } } { @@ -1700,8 +1732,8 @@ func TestCallsToNeighborCache(t *testing.T) { t.Fatalf("expected network endpoint to implement stack.AddressableEndpoint") } addr := lladdr0.WithPrefix() - if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.CanBePrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */); err != nil { - t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, CanBePrimaryEndpoint, AddressConfigStatic, false): %s", addr, err) + if ep, err := addressableEndpoint.AddAndAcquirePermanentAddress(addr, stack.AddressProperties{}); err != nil { + t.Fatalf("addressableEndpoint.AddAndAcquirePermanentAddress(%s, {}): %s", addr, err) } else { ep.DecRef() } diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 12763add6..c824e27fa 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -344,7 +344,10 @@ func (e *endpoint) onAddressAssignedLocked(addr tcpip.Address) { func (e *endpoint) InvalidateDefaultRouter(rtr tcpip.Address) { e.mu.Lock() defer e.mu.Unlock() - e.mu.ndp.invalidateDefaultRouter(rtr) + + // We represent default routers with a default (off-link) route through the + // router. + e.mu.ndp.invalidateOffLinkRoute(offLinkRoute{dest: header.IPv6EmptySubnet, router: rtr}) } // SetNDPConfigurations implements NDPEndpoint. @@ -755,9 +758,9 @@ func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, // based on destination address and do not send the packet to link // layer. // - // TODO(gvisor.dev/issue/170): We should do this for every - // packet, rather than only NATted packets, but removing this check - // short circuits broadcasts before they are sent out to other hosts. + // We should do this for every packet, rather than only NATted packets, but + // removing this check short circuits broadcasts before they are sent out to + // other hosts. if pkt.NatDone { netHeader := header.IPv6(pkt.NetworkHeader().View()) if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil { @@ -928,10 +931,6 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu ipH.SetSourceAddress(r.LocalAddress()) } - // Set the destination. If the packet already included a destination, it will - // be part of the route anyways. - ipH.SetDestinationAddress(r.RemoteAddress()) - // Populate the packet buffer's network header and don't allow an invalid // packet to be sent. // @@ -1129,6 +1128,11 @@ func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum func (e *endpoint) handleValidatedPacket(h header.IPv6, pkt *stack.PacketBuffer, inNICName string) { pkt.NICID = e.nic.ID() + + // Raw socket packets are delivered based solely on the transport protocol + // number. We only require that the packet be valid IPv6. + e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) + stats := e.stats.ip stats.ValidPacketsReceived.Increment() @@ -1624,12 +1628,12 @@ func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { } // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint. -func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { +func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) { // TODO(b/169350103): add checks here after making sure we no longer receive // an empty address. e.mu.Lock() defer e.mu.Unlock() - return e.addAndAcquirePermanentAddressLocked(addr, peb, configType, deprecated) + return e.addAndAcquirePermanentAddressLocked(addr, properties) } // addAndAcquirePermanentAddressLocked is like AddAndAcquirePermanentAddress but @@ -1639,8 +1643,8 @@ func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, p // solicited-node multicast group and start duplicate address detection. // // Precondition: e.mu must be write locked. -func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { - addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated) +func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) { + addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, properties) if err != nil { return nil, err } @@ -2007,11 +2011,6 @@ func (p *protocol) MinimumPacketSize() int { return header.IPv6MinimumSize } -// DefaultPrefixLen returns the IPv6 default prefix length. -func (p *protocol) DefaultPrefixLen() int { - return header.IPv6AddressSize * 8 -} - // ParseAddresses implements stack.NetworkProtocol. func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { h := header.IPv6(v) diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index d2a23fd4f..0735ebb23 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -41,12 +41,12 @@ import ( ) const ( - addr1 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" - addr2 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + addr1 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + addr2 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") // The least significant 3 bytes are the same as addr2 so both addr2 and // addr3 will have the same solicited-node address. - addr3 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x02" - addr4 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x03" + addr3 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x02") + addr4 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x03") // Tests use the extension header identifier values as uint8 instead of // header.IPv6ExtensionHeaderIdentifier. @@ -298,16 +298,24 @@ func TestReceiveOnSolicitedNodeAddr(t *testing.T) { // addr2/addr3 yet as we haven't added those addresses. test.rxf(t, s, e, addr1, snmc, 0) - if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr2, err) } // Should receive a packet destined to the solicited node address of // addr2/addr3 now that we have added added addr2. test.rxf(t, s, e, addr1, snmc, 1) - if err := s.AddAddress(nicID, ProtocolNumber, addr3); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr3, err) + protocolAddr3 := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: addr3.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr3, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr3, err) } // Should still receive a packet destined to the solicited node address of @@ -374,8 +382,12 @@ func TestAddIpv6Address(t *testing.T) { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, test.addr); err != nil { - t.Fatalf("AddAddress(%d, %d, nil) = %s", nicID, ProtocolNumber, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: test.addr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } if addr, err := s.GetMainNICAddress(nicID, ProtocolNumber); err != nil { @@ -898,8 +910,12 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } // Add a default route so that a return packet knows where to go. @@ -1992,8 +2008,12 @@ func TestReceiveIPv6Fragments(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } wq := waiter.Queue{} @@ -2060,8 +2080,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { func TestInvalidIPv6Fragments(t *testing.T) { const ( - addr1 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" - addr2 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + addr1 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + addr2 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") nicID = 1 hoplimit = 255 @@ -2150,8 +2170,12 @@ func TestInvalidIPv6Fragments(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{{ Destination: header.IPv6EmptySubnet, @@ -2216,8 +2240,8 @@ func TestInvalidIPv6Fragments(t *testing.T) { func TestFragmentReassemblyTimeout(t *testing.T) { const ( - addr1 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" - addr2 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + addr1 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + addr2 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") nicID = 1 hoplimit = 255 @@ -2402,8 +2426,12 @@ func TestFragmentReassemblyTimeout(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr2, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: addr2.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{{ Destination: header.IPv6EmptySubnet, @@ -2645,11 +2673,15 @@ func buildRoute(t *testing.T, ep stack.LinkEndpoint) *stack.Route { t.Fatalf("CreateNIC(1, _) failed: %s", err) } const ( - src = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" - dst = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + src = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + dst = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") ) - if err := s.AddAddress(1, ProtocolNumber, src); err != nil { - t.Fatalf("AddAddress(1, %d, %s) failed: %s", ProtocolNumber, src, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: src.WithPrefix(), + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } { mask := tcpip.AddressMask("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff") @@ -3297,8 +3329,8 @@ func TestForwarding(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", incomingNICID, err) } incomingIPv6ProtoAddr := tcpip.ProtocolAddress{Protocol: ProtocolNumber, AddressWithPrefix: incomingIPv6Addr} - if err := s.AddProtocolAddress(incomingNICID, incomingIPv6ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", incomingNICID, incomingIPv6ProtoAddr, err) + if err := s.AddProtocolAddress(incomingNICID, incomingIPv6ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", incomingNICID, incomingIPv6ProtoAddr, err) } outgoingEndpoint := channel.New(1, header.IPv6MinimumMTU, "") @@ -3306,8 +3338,8 @@ func TestForwarding(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", outgoingNICID, err) } outgoingIPv6ProtoAddr := tcpip.ProtocolAddress{Protocol: ProtocolNumber, AddressWithPrefix: outgoingIPv6Addr} - if err := s.AddProtocolAddress(outgoingNICID, outgoingIPv6ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", outgoingNICID, outgoingIPv6ProtoAddr, err) + if err := s.AddProtocolAddress(outgoingNICID, outgoingIPv6ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", outgoingNICID, outgoingIPv6ProtoAddr, err) } s.SetRouteTable([]tcpip.Route{ diff --git a/pkg/tcpip/network/ipv6/mld_test.go b/pkg/tcpip/network/ipv6/mld_test.go index bc9cf6999..3e5c438d3 100644 --- a/pkg/tcpip/network/ipv6/mld_test.go +++ b/pkg/tcpip/network/ipv6/mld_test.go @@ -75,8 +75,12 @@ func TestIPv6JoinLeaveSolicitedNodeAddressPerformsMLD(t *testing.T) { // The stack will join an address's solicited node multicast address when // an address is added. An MLD report message should be sent for the // solicited-node group. - if err := s.AddAddress(nicID, ipv6.ProtocolNumber, linkLocalAddr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ipv6.ProtocolNumber, linkLocalAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: linkLocalAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } if p, ok := e.Read(); !ok { t.Fatal("expected a report message to be sent") @@ -216,8 +220,13 @@ func TestSendQueuedMLDReports(t *testing.T) { // Note, we will still expect to send a report for the global address's // solicited node address from the unspecified address as per RFC 3590 // section 4. - if err := s.AddAddressWithOptions(nicID, ipv6.ProtocolNumber, globalAddr, stack.FirstPrimaryEndpoint); err != nil { - t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s", nicID, ipv6.ProtocolNumber, globalAddr, stack.FirstPrimaryEndpoint, err) + properties := stack.AddressProperties{PEB: stack.FirstPrimaryEndpoint} + globalProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: globalAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, globalProtocolAddr, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v): %s", nicID, globalProtocolAddr, properties, err) } reportCounter++ if got := reportStat.Value(); got != reportCounter { @@ -252,8 +261,12 @@ func TestSendQueuedMLDReports(t *testing.T) { // Adding a link-local address should send a report for its solicited node // address and globalMulticastAddr. - if err := s.AddAddressWithOptions(nicID, ipv6.ProtocolNumber, linkLocalAddr, stack.CanBePrimaryEndpoint); err != nil { - t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s", nicID, ipv6.ProtocolNumber, linkLocalAddr, stack.CanBePrimaryEndpoint, err) + linkLocalProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: linkLocalAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, linkLocalProtocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, linkLocalProtocolAddr, err) } if dadResolutionTime != 0 { reportCounter++ @@ -567,8 +580,12 @@ func TestMLDSkipProtocol(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(nicID, ipv6.ProtocolNumber, linkLocalAddr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ipv6.ProtocolNumber, linkLocalAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: linkLocalAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } if p, ok := e.Read(); !ok { t.Fatal("expected a report message to be sent") diff --git a/pkg/tcpip/network/ipv6/ndp.go b/pkg/tcpip/network/ipv6/ndp.go index 851cd6e75..938427420 100644 --- a/pkg/tcpip/network/ipv6/ndp.go +++ b/pkg/tcpip/network/ipv6/ndp.go @@ -54,6 +54,11 @@ const ( // Advertisements, as a host. defaultDiscoverDefaultRouters = true + // defaultDiscoverMoreSpecificRoutes is the default configuration for + // whether or not to discover more-specific routes from incoming Router + // Advertisements, as a host. + defaultDiscoverMoreSpecificRoutes = true + // defaultDiscoverOnLinkPrefixes is the default configuration for // whether or not to discover on-link prefixes from incoming Router // Advertisements' Prefix Information option, as a host. @@ -78,13 +83,13 @@ const ( // we cannot have a negative delay. minimumMaxRtrSolicitationDelay = 0 - // MaxDiscoveredDefaultRouters is the maximum number of discovered - // default routers. The stack should stop discovering new routers after - // discovering MaxDiscoveredDefaultRouters routers. + // MaxDiscoveredOffLinkRoutes is the maximum number of discovered off-link + // routes. The stack should stop discovering new off-link routes after + // this limit is reached. // // This value MUST be at minimum 2 as per RFC 4861 section 6.3.4, and // SHOULD be more. - MaxDiscoveredDefaultRouters = 10 + MaxDiscoveredOffLinkRoutes = 10 // MaxDiscoveredOnLinkPrefixes is the maximum number of discovered // on-link prefixes. The stack should stop discovering new on-link @@ -127,25 +132,17 @@ const ( // maxSLAACAddrLocalRegenAttempts is the maximum number of times to attempt // SLAAC address regenerations in response to an IPv6 endpoint-local conflict. maxSLAACAddrLocalRegenAttempts = 10 -) -var ( // MinPrefixInformationValidLifetimeForUpdate is the minimum Valid // Lifetime to update the valid lifetime of a generated address by // SLAAC. // - // This is exported as a variable (instead of a constant) so tests - // can update it to a smaller value. - // // Min = 2hrs. MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour // MaxDesyncFactor is the upper bound for the preferred lifetime's desync // factor for temporary SLAAC addresses. // - // This is exported as a variable (instead of a constant) so tests - // can update it to a smaller value. - // // Must be greater than 0. // // Max = 10m (from RFC 4941 section 5). @@ -154,9 +151,6 @@ var ( // MinMaxTempAddrPreferredLifetime is the minimum value allowed for the // maximum preferred lifetime for temporary SLAAC addresses. // - // This is exported as a variable (instead of a constant) so tests - // can update it to a smaller value. - // // This value guarantees that a temporary address is preferred for at // least 1hr if the SLAAC prefix is valid for at least that time. MinMaxTempAddrPreferredLifetime = defaultRegenAdvanceDuration + MaxDesyncFactor + time.Hour @@ -164,9 +158,6 @@ var ( // MinMaxTempAddrValidLifetime is the minimum value allowed for the // maximum valid lifetime for temporary SLAAC addresses. // - // This is exported as a variable (instead of a constant) so tests - // can update it to a smaller value. - // // This value guarantees that a temporary address is valid for at least // 2hrs if the SLAAC prefix is valid for at least that time. MinMaxTempAddrValidLifetime = 2 * time.Hour @@ -214,28 +205,23 @@ type NDPDispatcher interface { // is also not permitted to call into the stack. OnDuplicateAddressDetectionResult(tcpip.NICID, tcpip.Address, stack.DADResult) - // OnDefaultRouterDiscovered is called when a new default router is - // discovered. Implementations must return true if the newly discovered - // router should be remembered. + // OnOffLinkRouteUpdated is called when an off-link route is updated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. - OnDefaultRouterDiscovered(tcpip.NICID, tcpip.Address) bool + OnOffLinkRouteUpdated(tcpip.NICID, tcpip.Subnet, tcpip.Address, header.NDPRoutePreference) - // OnDefaultRouterInvalidated is called when a discovered default router that - // was remembered is invalidated. + // OnOffLinkRouteInvalidated is called when an off-link route is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. - OnDefaultRouterInvalidated(tcpip.NICID, tcpip.Address) + OnOffLinkRouteInvalidated(tcpip.NICID, tcpip.Subnet, tcpip.Address) // OnOnLinkPrefixDiscovered is called when a new on-link prefix is discovered. - // Implementations must return true if the newly discovered on-link prefix - // should be remembered. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. - OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool + OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) // OnOnLinkPrefixInvalidated is called when a discovered on-link prefix that // was remembered is invalidated. @@ -245,13 +231,11 @@ type NDPDispatcher interface { OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) // OnAutoGenAddress is called when a new prefix with its autonomous address- - // configuration flag set is received and SLAAC was performed. Implementations - // may prevent the stack from assigning the address to the NIC by returning - // false. + // configuration flag set is received and SLAAC was performed. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. - OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool + OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) // OnAutoGenAddressDeprecated is called when an auto-generated address (SLAAC) // is deprecated, but is still considered valid. Note, if an address is @@ -373,12 +357,18 @@ type NDPConfigurations struct { // DiscoverDefaultRouters determines whether or not default routers are // discovered from Router Advertisements, as per RFC 4861 section 6. This - // configuration is ignored if HandleRAs is false. + // configuration is ignored if RAs will not be processed (see HandleRAs). DiscoverDefaultRouters bool + // DiscoverMoreSpecificRoutes determines whether or not more specific routes + // are discovered from Router Advertisements, as per RFC 4191. This + // configuration is ignored if RAs will not be processed (see HandleRAs). + DiscoverMoreSpecificRoutes bool + // DiscoverOnLinkPrefixes determines whether or not on-link prefixes are // discovered from Router Advertisements' Prefix Information option, as per - // RFC 4861 section 6. This configuration is ignored if HandleRAs is false. + // RFC 4861 section 6. This configuration is ignored if RAs will not be + // processed (see HandleRAs). DiscoverOnLinkPrefixes bool // AutoGenGlobalAddresses determines whether or not an IPv6 endpoint performs @@ -429,6 +419,7 @@ func DefaultNDPConfigurations() NDPConfigurations { MaxRtrSolicitationDelay: defaultMaxRtrSolicitationDelay, HandleRAs: defaultHandleRAs, DiscoverDefaultRouters: defaultDiscoverDefaultRouters, + DiscoverMoreSpecificRoutes: defaultDiscoverMoreSpecificRoutes, DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes, AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses, AutoGenTempGlobalAddresses: defaultAutoGenTempGlobalAddresses, @@ -469,6 +460,11 @@ type timer struct { timer tcpip.Timer } +type offLinkRoute struct { + dest tcpip.Subnet + router tcpip.Address +} + // ndpState is the per-Interface NDP state. type ndpState struct { // Do not allow overwriting this state. @@ -483,8 +479,8 @@ type ndpState struct { // The DAD timers to send the next NS message, or resolve the address. dad ip.DAD - // The default routers discovered through Router Advertisements. - defaultRouters map[tcpip.Address]defaultRouterState + // The off-link routes discovered through Router Advertisements. + offLinkRoutes map[offLinkRoute]offLinkRouteState // rtrSolicitTimer is the timer used to send the next router solicitation // message. @@ -512,10 +508,12 @@ type ndpState struct { temporaryAddressDesyncFactor time.Duration } -// defaultRouterState holds data associated with a default router discovered by +// offLinkRouteState holds data associated with an off-link route discovered by // a Router Advertisement (RA). -type defaultRouterState struct { - // Job to invalidate the default router. +type offLinkRouteState struct { + prf header.NDPRoutePreference + + // Job to invalidate the route. // // Must not be nil. invalidationJob *tcpip.Job @@ -571,11 +569,11 @@ type slaacPrefixState struct { // Must not be nil. invalidationJob *tcpip.Job - // Nonzero only when the address is not valid forever. - validUntil tcpip.MonotonicTime + // nil iff the address is valid forever. + validUntil *tcpip.MonotonicTime - // Nonzero only when the address is not preferred forever. - preferredUntil tcpip.MonotonicTime + // nil iff the address is preferred forever. + preferredUntil *tcpip.MonotonicTime // State associated with the stable address generated for the prefix. stableAddr struct { @@ -733,30 +731,22 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { // Is the IPv6 endpoint configured to discover default routers? if ndp.configs.DiscoverDefaultRouters { - rtr, ok := ndp.defaultRouters[ip] - rl := ra.RouterLifetime() - switch { - case !ok && rl != 0: - // This is a new default router we are discovering. + prf := ra.DefaultRouterPreference() + if prf == header.ReservedRoutePreference { + // As per RFC 4191 section 2.2, // - // Only remember it if we currently know about less than - // MaxDiscoveredDefaultRouters routers. - if len(ndp.defaultRouters) < MaxDiscoveredDefaultRouters { - ndp.rememberDefaultRouter(ip, rl) - } - - case ok && rl != 0: - // This is an already discovered default router. Update - // the invalidation job. - rtr.invalidationJob.Cancel() - rtr.invalidationJob.Schedule(rl) - ndp.defaultRouters[ip] = rtr - - case ok && rl == 0: - // We know about the router but it is no longer to be - // used as a default router so invalidate it. - ndp.invalidateDefaultRouter(ip) + // Prf (Default Router Preference) + // + // If the Reserved (10) value is received, the receiver MUST treat the + // value as if it were (00). + // + // Note that the value 00 is the medium (default) router preference value. + prf = header.MediumRoutePreference } + + // We represent default routers with a default (off-link) route through the + // router. + ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: header.IPv6EmptySubnet, router: ip}, ra.RouterLifetime(), prf) } // TODO(b/141556115): Do (RetransTimer, ReachableTime)) Parameter @@ -808,61 +798,107 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { if opt.AutonomousAddressConfigurationFlag() { ndp.handleAutonomousPrefixInformation(opt) } + + case header.NDPRouteInformation: + if !ndp.configs.DiscoverMoreSpecificRoutes { + continue + } + + dest, err := opt.Prefix() + if err != nil { + panic(fmt.Sprintf("%T.Prefix(): %s", opt, err)) + } + + prf := opt.RoutePreference() + if prf == header.ReservedRoutePreference { + // As per RFC 4191 section 2.3, + // + // Prf (Route Preference) + // 2-bit signed integer. The Route Preference indicates + // whether to prefer the router associated with this prefix + // over others, when multiple identical prefixes (for + // different routers) have been received. If the Reserved + // (10) value is received, the Route Information Option MUST + // be ignored. + continue + } + + ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: dest, router: ip}, opt.RouteLifetime(), prf) } // TODO(b/141556115): Do (MTU) Parameter Discovery. } } -// invalidateDefaultRouter invalidates a discovered default router. +// invalidateOffLinkRoute invalidates a discovered off-link route. // // The IPv6 endpoint that ndp belongs to MUST be locked. -func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { - rtr, ok := ndp.defaultRouters[ip] - - // Is the router still discovered? +func (ndp *ndpState) invalidateOffLinkRoute(route offLinkRoute) { + state, ok := ndp.offLinkRoutes[route] if !ok { - // ...Nope, do nothing further. return } - rtr.invalidationJob.Cancel() - delete(ndp.defaultRouters, ip) + state.invalidationJob.Cancel() + delete(ndp.offLinkRoutes, route) - // Let the integrator know a discovered default router is invalidated. + // Let the integrator know a discovered off-link route is invalidated. if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { - ndpDisp.OnDefaultRouterInvalidated(ndp.ep.nic.ID(), ip) + ndpDisp.OnOffLinkRouteInvalidated(ndp.ep.nic.ID(), route.dest, route.router) } } -// rememberDefaultRouter remembers a newly discovered default router with IPv6 -// link-local address ip with lifetime rl. -// -// The router identified by ip MUST NOT already be known by the IPv6 endpoint. +// handleOffLinkRouteDiscovery handles the discovery of an off-link route. // -// The IPv6 endpoint that ndp belongs to MUST be locked. -func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { +// Precondition: ndp.ep.mu must be locked. +func (ndp *ndpState) handleOffLinkRouteDiscovery(route offLinkRoute, lifetime time.Duration, prf header.NDPRoutePreference) { ndpDisp := ndp.ep.protocol.options.NDPDisp if ndpDisp == nil { return } - // Inform the integrator when we discovered a default router. - if !ndpDisp.OnDefaultRouterDiscovered(ndp.ep.nic.ID(), ip) { - // Informed by the integrator to not remember the router, do - // nothing further. - return - } + state, ok := ndp.offLinkRoutes[route] + switch { + case !ok && lifetime != 0: + // This is a new route we are discovering. + // + // Only remember it if we currently know about less than + // MaxDiscoveredOffLinkRoutes routers. + if len(ndp.offLinkRoutes) < MaxDiscoveredOffLinkRoutes { + // Inform the integrator when we discovered an off-link route. + ndpDisp.OnOffLinkRouteUpdated(ndp.ep.nic.ID(), route.dest, route.router, prf) + + state := offLinkRouteState{ + prf: prf, + invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { + ndp.invalidateOffLinkRoute(route) + }), + } - state := defaultRouterState{ - invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { - ndp.invalidateDefaultRouter(ip) - }), - } + state.invalidationJob.Schedule(lifetime) + + ndp.offLinkRoutes[route] = state + } - state.invalidationJob.Schedule(rl) + case ok && lifetime != 0: + // This is an already discovered off-link route. Update the lifetime. + state.invalidationJob.Cancel() + state.invalidationJob.Schedule(lifetime) - ndp.defaultRouters[ip] = state + if prf != state.prf { + state.prf = prf + + // Inform the integrator about route preference updates. + ndpDisp.OnOffLinkRouteUpdated(ndp.ep.nic.ID(), route.dest, route.router, prf) + } + + ndp.offLinkRoutes[route] = state + + case ok && lifetime == 0: + // The already discovered off-link route is no longer considered valid so we + // invalidate it immediately. + ndp.invalidateOffLinkRoute(route) + } } // rememberOnLinkPrefix remembers a newly discovered on-link prefix with IPv6 @@ -878,11 +914,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) } // Inform the integrator when we discovered an on-link prefix. - if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.ep.nic.ID(), prefix) { - // Informed by the integrator to not remember the prefix, do - // nothing further. - return - } + ndpDisp.OnOnLinkPrefixDiscovered(ndp.ep.nic.ID(), prefix) state := onLinkPrefixState{ invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { @@ -1055,7 +1087,8 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { // The time an address is preferred until is needed to properly generate the // address. if pl < header.NDPInfiniteLifetime { - state.preferredUntil = now.Add(pl) + t := now.Add(pl) + state.preferredUntil = &t } if !ndp.generateSLAACAddr(prefix, &state) { @@ -1073,7 +1106,8 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { if vl < header.NDPInfiniteLifetime { state.invalidationJob.Schedule(vl) - state.validUntil = now.Add(vl) + t := now.Add(vl) + state.validUntil = &t } // If the address is assigned (DAD resolved), generate a temporary address. @@ -1096,16 +1130,17 @@ func (ndp *ndpState) addAndAcquireSLAACAddr(addr tcpip.AddressWithPrefix, config return nil } - if !ndpDisp.OnAutoGenAddress(ndp.ep.nic.ID(), addr) { - // Informed by the integrator not to add the address. - return nil - } - - addressEndpoint, err := ndp.ep.addAndAcquirePermanentAddressLocked(addr, stack.FirstPrimaryEndpoint, configType, deprecated) + addressEndpoint, err := ndp.ep.addAndAcquirePermanentAddressLocked(addr, stack.AddressProperties{ + PEB: stack.FirstPrimaryEndpoint, + ConfigType: configType, + Deprecated: deprecated, + }) if err != nil { panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", addr, err)) } + ndpDisp.OnAutoGenAddress(ndp.ep.nic.ID(), addr) + return addressEndpoint } @@ -1181,7 +1216,8 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt state.stableAddr.localGenerationFailures++ } - if addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaac, ndp.ep.protocol.stack.Clock().NowMonotonic().Sub(state.preferredUntil) >= 0 /* deprecated */); addressEndpoint != nil { + deprecated := state.preferredUntil != nil && !state.preferredUntil.After(ndp.ep.protocol.stack.Clock().NowMonotonic()) + if addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaac, deprecated); addressEndpoint != nil { state.stableAddr.addressEndpoint = addressEndpoint state.generationAttempts++ return true @@ -1242,7 +1278,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla // address is the lower of the valid lifetime of the stable address or the // maximum temporary address valid lifetime. vl := ndp.configs.MaxTempAddrValidLifetime - if prefixState.validUntil != (tcpip.MonotonicTime{}) { + if prefixState.validUntil != nil { if prefixVL := prefixState.validUntil.Sub(now); vl > prefixVL { vl = prefixVL } @@ -1258,7 +1294,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla // maximum temporary address preferred lifetime - the temporary address desync // factor. pl := ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor - if prefixState.preferredUntil != (tcpip.MonotonicTime{}) { + if prefixState.preferredUntil != nil { if prefixPL := prefixState.preferredUntil.Sub(now); pl > prefixPL { // Respect the preferred lifetime of the prefix, as per RFC 4941 section // 3.3 step 4. @@ -1400,9 +1436,10 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat if !deprecated { prefixState.deprecationJob.Schedule(pl) } - prefixState.preferredUntil = now.Add(pl) + t := now.Add(pl) + prefixState.preferredUntil = &t } else { - prefixState.preferredUntil = tcpip.MonotonicTime{} + prefixState.preferredUntil = nil } // As per RFC 4862 section 5.5.3.e, update the valid lifetime for prefix: @@ -1420,14 +1457,14 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // Handle the infinite valid lifetime separately as we do not schedule a // job in this case. prefixState.invalidationJob.Cancel() - prefixState.validUntil = tcpip.MonotonicTime{} + prefixState.validUntil = nil } else { var effectiveVl time.Duration var rl time.Duration // If the prefix was originally set to be valid forever, assume the // remaining time to be the maximum possible value. - if prefixState.validUntil == (tcpip.MonotonicTime{}) { + if prefixState.validUntil == nil { rl = header.NDPInfiniteLifetime } else { rl = prefixState.validUntil.Sub(now) @@ -1442,7 +1479,8 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat if effectiveVl != 0 { prefixState.invalidationJob.Cancel() prefixState.invalidationJob.Schedule(effectiveVl) - prefixState.validUntil = now.Add(effectiveVl) + t := now.Add(effectiveVl) + prefixState.validUntil = &t } } @@ -1462,8 +1500,8 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // maximum temporary address valid lifetime. Note, the valid lifetime of a // temporary address is relative to the address's creation time. validUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrValidLifetime) - if prefixState.validUntil != (tcpip.MonotonicTime{}) && validUntil.Sub(prefixState.validUntil) > 0 { - validUntil = prefixState.validUntil + if prefixState.validUntil != nil && prefixState.validUntil.Before(validUntil) { + validUntil = *prefixState.validUntil } // If the address is no longer valid, invalidate it immediately. Otherwise, @@ -1482,14 +1520,15 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // desync factor. Note, the preferred lifetime of a temporary address is // relative to the address's creation time. preferredUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor) - if prefixState.preferredUntil != (tcpip.MonotonicTime{}) && preferredUntil.Sub(prefixState.preferredUntil) > 0 { - preferredUntil = prefixState.preferredUntil + if prefixState.preferredUntil != nil && prefixState.preferredUntil.Before(preferredUntil) { + preferredUntil = *prefixState.preferredUntil } // If the address is no longer preferred, deprecate it immediately. // Otherwise, schedule the deprecation job again. newPreferredLifetime := preferredUntil.Sub(now) tempAddrState.deprecationJob.Cancel() + if newPreferredLifetime <= 0 { ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint) } else { @@ -1679,12 +1718,12 @@ func (ndp *ndpState) cleanupState() { panic(fmt.Sprintf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got)) } - for router := range ndp.defaultRouters { - ndp.invalidateDefaultRouter(router) + for route := range ndp.offLinkRoutes { + ndp.invalidateOffLinkRoute(route) } - if got := len(ndp.defaultRouters); got != 0 { - panic(fmt.Sprintf("ndp: still have discovered default routers after cleaning up; found = %d", got)) + if got := len(ndp.offLinkRoutes); got != 0 { + panic(fmt.Sprintf("ndp: still have discovered off-link routes after cleaning up; found = %d", got)) } ndp.dhcpv6Configuration = 0 @@ -1847,21 +1886,19 @@ func (ndp *ndpState) stopSolicitingRouters() { } func (ndp *ndpState) init(ep *endpoint, dadOptions ip.DADOptions) { - if ndp.defaultRouters != nil { + if ndp.offLinkRoutes != nil { panic("attempted to initialize NDP state twice") } ndp.ep = ep ndp.configs = ep.protocol.options.NDPConfigs ndp.dad.Init(&ndp.ep.mu, ep.protocol.options.DADConfigs, dadOptions) - ndp.defaultRouters = make(map[tcpip.Address]defaultRouterState) + ndp.offLinkRoutes = make(map[offLinkRoute]offLinkRouteState) ndp.onLinkPrefixes = make(map[tcpip.Subnet]onLinkPrefixState) ndp.slaacPrefixes = make(map[tcpip.Subnet]slaacPrefixState) header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.options.TempIIDSeed, ndp.ep.nic.ID()) - if MaxDesyncFactor != 0 { - ndp.temporaryAddressDesyncFactor = time.Duration(ep.protocol.stack.Rand().Int63n(int64(MaxDesyncFactor))) - } + ndp.temporaryAddressDesyncFactor = time.Duration(ep.protocol.stack.Rand().Int63n(int64(MaxDesyncFactor))) } func (ndp *ndpState) SendDADMessage(addr tcpip.Address, nonce []byte) tcpip.Error { diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go index 3438deb79..8297a7e10 100644 --- a/pkg/tcpip/network/ipv6/ndp_test.go +++ b/pkg/tcpip/network/ipv6/ndp_test.go @@ -42,24 +42,21 @@ type testNDPDispatcher struct { func (*testNDPDispatcher) OnDuplicateAddressDetectionResult(tcpip.NICID, tcpip.Address, stack.DADResult) { } -func (t *testNDPDispatcher) OnDefaultRouterDiscovered(_ tcpip.NICID, addr tcpip.Address) bool { +func (t *testNDPDispatcher) OnOffLinkRouteUpdated(_ tcpip.NICID, _ tcpip.Subnet, addr tcpip.Address, _ header.NDPRoutePreference) { t.addr = addr - return true } -func (t *testNDPDispatcher) OnDefaultRouterInvalidated(_ tcpip.NICID, addr tcpip.Address) { +func (t *testNDPDispatcher) OnOffLinkRouteInvalidated(_ tcpip.NICID, _ tcpip.Subnet, addr tcpip.Address) { t.addr = addr } -func (*testNDPDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool { - return false +func (*testNDPDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) { } func (*testNDPDispatcher) OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) { } -func (*testNDPDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool { - return false +func (*testNDPDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) { } func (*testNDPDispatcher) OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) { @@ -96,7 +93,7 @@ func TestStackNDPEndpointInvalidateDefaultRouter(t *testing.T) { ipv6EP := ep.(*endpoint) ipv6EP.mu.Lock() - ipv6EP.mu.ndp.rememberDefaultRouter(lladdr1, time.Hour) + ipv6EP.mu.ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: header.IPv6EmptySubnet, router: lladdr1}, time.Hour, header.MediumRoutePreference) ipv6EP.mu.Unlock() if ndpDisp.addr != lladdr1 { @@ -147,8 +144,12 @@ func TestNeighborSolicitationWithSourceLinkLayerOption(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + len(test.optsBuf) @@ -409,8 +410,12 @@ func TestNeighborSolicitationResponse(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: nicAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{ @@ -605,8 +610,12 @@ func TestNeighborAdvertisementWithTargetLinkLayerOption(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf) @@ -834,8 +843,12 @@ func TestNDPValidation(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } ep, err := s.GetNetworkEndpoint(nicID, ProtocolNumber) @@ -965,8 +978,12 @@ func TestNeighborAdvertisementValidation(t *testing.T) { if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } ndpNASize := header.ICMPv6NeighborAdvertMinimumSize @@ -1286,8 +1303,12 @@ func TestCheckDuplicateAddress(t *testing.T) { checker.NDPNSOptions([]header.NDPOption{header.NDPNonceOption(nonces[dadPacketsSent])}), )) } - if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ProtocolNumber, + AddressWithPrefix: lladdr0.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } checkDADMsg() diff --git a/pkg/tcpip/network/multicast_group_test.go b/pkg/tcpip/network/multicast_group_test.go index 1b96b1fb8..26640b7ee 100644 --- a/pkg/tcpip/network/multicast_group_test.go +++ b/pkg/tcpip/network/multicast_group_test.go @@ -151,15 +151,22 @@ func createStackWithLinkEndpoint(t *testing.T, v4, mgpEnabled bool, e stack.Link if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - addr := tcpip.AddressWithPrefix{ - Address: stackIPv4Addr, - PrefixLen: defaultIPv4PrefixLength, + addr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: stackIPv4Addr, + PrefixLen: defaultIPv4PrefixLength, + }, + } + if err := s.AddProtocolAddress(nicID, addr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, addr, err) } - if err := s.AddAddressWithPrefix(nicID, ipv4.ProtocolNumber, addr); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, addr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: linkLocalIPv6Addr1.WithPrefix(), } - if err := s.AddAddress(nicID, ipv6.ProtocolNumber, linkLocalIPv6Addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, linkLocalIPv6Addr1, err) + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } return s, clock diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index b7f6d52ae..fe98a52af 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -12,6 +12,7 @@ go_library( deps = [ "//pkg/sync", "//pkg/tcpip", + "//pkg/tcpip/header", ], ) diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 854d6a6ba..fb8ef1ee2 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( @@ -122,7 +123,7 @@ type deviceToDest map[tcpip.NICID]destToCounter // If either of the port reuse flags is enabled on any of the nodes, all nodes // sharing a port must share at least one reuse flag. This matches Linux's // behavior. -func (dd deviceToDest) isAvailable(res Reservation) bool { +func (dd deviceToDest) isAvailable(res Reservation, portSpecified bool) bool { flagBits := res.Flags.Bits() if res.BindToDevice == 0 { intersection := FlagMask @@ -138,6 +139,9 @@ func (dd deviceToDest) isAvailable(res Reservation) bool { return false } } + if !portSpecified && res.Transport == header.TCPProtocolNumber { + return false + } return true } @@ -146,16 +150,26 @@ func (dd deviceToDest) isAvailable(res Reservation) bool { if dests, ok := dd[0]; ok { var count int intersection, count = dests.intersectionFlags(res) - if count > 0 && intersection&flagBits == 0 { - return false + if count > 0 { + if intersection&flagBits == 0 { + return false + } + if !portSpecified && res.Transport == header.TCPProtocolNumber { + return false + } } } if dests, ok := dd[res.BindToDevice]; ok { flags, count := dests.intersectionFlags(res) intersection &= flags - if count > 0 && intersection&flagBits == 0 { - return false + if count > 0 { + if intersection&flagBits == 0 { + return false + } + if !portSpecified && res.Transport == header.TCPProtocolNumber { + return false + } } } @@ -168,12 +182,12 @@ type addrToDevice map[tcpip.Address]deviceToDest // isAvailable checks whether an IP address is available to bind to. If the // address is the "any" address, check all other addresses. Otherwise, just // check against the "any" address and the provided address. -func (ad addrToDevice) isAvailable(res Reservation) bool { +func (ad addrToDevice) isAvailable(res Reservation, portSpecified bool) bool { if res.Addr == anyIPAddress { // If binding to the "any" address then check that there are no // conflicts with all addresses. for _, devices := range ad { - if !devices.isAvailable(res) { + if !devices.isAvailable(res, portSpecified) { return false } } @@ -182,14 +196,14 @@ func (ad addrToDevice) isAvailable(res Reservation) bool { // Check that there is no conflict with the "any" address. if devices, ok := ad[anyIPAddress]; ok { - if !devices.isAvailable(res) { + if !devices.isAvailable(res, portSpecified) { return false } } // Check that this is no conflict with the provided address. if devices, ok := ad[res.Addr]; ok { - if !devices.isAvailable(res) { + if !devices.isAvailable(res, portSpecified) { return false } } @@ -310,7 +324,7 @@ func (pm *PortManager) ReservePort(rng *rand.Rand, res Reservation, testPort Por // If a port is specified, just try to reserve it for all network // protocols. if res.Port != 0 { - if !pm.reserveSpecificPortLocked(res) { + if !pm.reserveSpecificPortLocked(res, true /* portSpecified */) { return 0, &tcpip.ErrPortInUse{} } if testPort != nil { @@ -330,7 +344,7 @@ func (pm *PortManager) ReservePort(rng *rand.Rand, res Reservation, testPort Por // A port wasn't specified, so try to find one. return pm.PickEphemeralPort(rng, func(p uint16) (bool, tcpip.Error) { res.Port = p - if !pm.reserveSpecificPortLocked(res) { + if !pm.reserveSpecificPortLocked(res, false /* portSpecified */) { return false, nil } if testPort != nil { @@ -350,12 +364,12 @@ func (pm *PortManager) ReservePort(rng *rand.Rand, res Reservation, testPort Por // reserveSpecificPortLocked tries to reserve the given port on all given // protocols. -func (pm *PortManager) reserveSpecificPortLocked(res Reservation) bool { +func (pm *PortManager) reserveSpecificPortLocked(res Reservation, portSpecified bool) bool { // Make sure the port is available. for _, network := range res.Networks { desc := portDescriptor{network, res.Transport, res.Port} if addrs, ok := pm.allocatedPorts[desc]; ok { - if !addrs.isAvailable(res) { + if !addrs.isAvailable(res, portSpecified) { return false } } diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index b9a24ff56..05b879543 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // This sample creates a stack with TCP and IPv4 protocols on top of a TUN @@ -145,8 +146,12 @@ func main() { log.Fatal(err) } - if err := s.AddAddress(1, ipv4.ProtocolNumber, addr); err != nil { - log.Fatal(err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addr.WithPrefix(), + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + log.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } // Add default route. diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index ef1bfc186..a72afadda 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build linux // +build linux // This sample creates a stack with TCP and IPv4 protocols on top of a TUN @@ -123,13 +124,13 @@ func main() { log.Fatalf("Bad IP address: %v", addrName) } - var addr tcpip.Address + var addrWithPrefix tcpip.AddressWithPrefix var proto tcpip.NetworkProtocolNumber if parsedAddr.To4() != nil { - addr = tcpip.Address(parsedAddr.To4()) + addrWithPrefix = tcpip.Address(parsedAddr.To4()).WithPrefix() proto = ipv4.ProtocolNumber } else if parsedAddr.To16() != nil { - addr = tcpip.Address(parsedAddr.To16()) + addrWithPrefix = tcpip.Address(parsedAddr.To16()).WithPrefix() proto = ipv6.ProtocolNumber } else { log.Fatalf("Unknown IP type: %v", addrName) @@ -175,11 +176,15 @@ func main() { log.Fatal(err) } - if err := s.AddAddress(1, proto, addr); err != nil { - log.Fatal(err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: proto, + AddressWithPrefix: addrWithPrefix, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + log.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } - subnet, err := tcpip.NewSubnet(tcpip.Address(strings.Repeat("\x00", len(addr))), tcpip.AddressMask(strings.Repeat("\x00", len(addr)))) + subnet, err := tcpip.NewSubnet(tcpip.Address(strings.Repeat("\x00", len(addrWithPrefix.Address))), tcpip.AddressMask(strings.Repeat("\x00", len(addrWithPrefix.Address)))) if err != nil { log.Fatal(err) } diff --git a/pkg/tcpip/socketops.go b/pkg/tcpip/socketops.go index 0ea85f9ed..34ac62444 100644 --- a/pkg/tcpip/socketops.go +++ b/pkg/tcpip/socketops.go @@ -15,17 +15,12 @@ package tcpip import ( - "math" "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sync" ) -// PacketOverheadFactor is used to multiply the value provided by the user on a -// SetSockOpt for setting the send/receive buffer sizes sockets. -const PacketOverheadFactor = 2 - // SocketOptionsHandler holds methods that help define endpoint specific // behavior for socket level socket options. These must be implemented by // endpoints to get notified when socket level options are set. @@ -60,8 +55,13 @@ type SocketOptionsHandler interface { // buffer size. It also returns the newly set value. OnSetSendBufferSize(v int64) (newSz int64) - // OnSetReceiveBufferSize is invoked to set the SO_RCVBUFSIZE. + // OnSetReceiveBufferSize is invoked by SO_RCVBUF and SO_RCVBUFFORCE. OnSetReceiveBufferSize(v, oldSz int64) (newSz int64) + + // WakeupWriters is invoked when the send buffer size for an endpoint is + // changed. The handler notifies the writers if the send buffer size is + // increased with setsockopt(2) for TCP endpoints. + WakeupWriters() } // DefaultSocketOptionsHandler is an embeddable type that implements no-op @@ -103,6 +103,9 @@ func (*DefaultSocketOptionsHandler) OnSetSendBufferSize(v int64) (newSz int64) { return v } +// WakeupWriters implements SocketOptionsHandler.WakeupWriters. +func (*DefaultSocketOptionsHandler) WakeupWriters() {} + // OnSetReceiveBufferSize implements SocketOptionsHandler.OnSetReceiveBufferSize. func (*DefaultSocketOptionsHandler) OnSetReceiveBufferSize(v, oldSz int64) (newSz int64) { return v @@ -617,39 +620,23 @@ func (so *SocketOptions) GetSendBufferSize() int64 { return so.sendBufferSize.Load() } +// SendBufferLimits returns the [min, max) range of allowable send buffer +// sizes. +func (so *SocketOptions) SendBufferLimits() (min, max int64) { + limits := so.getSendBufferLimits(so.stackHandler) + return int64(limits.Min), int64(limits.Max) +} + // SetSendBufferSize sets value for SO_SNDBUF option. notify indicates if the // stack handler should be invoked to set the send buffer size. func (so *SocketOptions) SetSendBufferSize(sendBufferSize int64, notify bool) { - v := sendBufferSize - - if !notify { - so.sendBufferSize.Store(v) - return - } - - // Make sure the send buffer size is within the min and max - // allowed. - ss := so.getSendBufferLimits(so.stackHandler) - min := int64(ss.Min) - max := int64(ss.Max) - // Validate the send buffer size with min and max values. - // Multiply it by factor of 2. - if v > max { - v = max + if notify { + sendBufferSize = so.handler.OnSetSendBufferSize(sendBufferSize) } - - if v < math.MaxInt32/PacketOverheadFactor { - v *= PacketOverheadFactor - if v < min { - v = min - } - } else { - v = math.MaxInt32 + so.sendBufferSize.Store(sendBufferSize) + if notify { + so.handler.WakeupWriters() } - - // Notify endpoint about change in buffer size. - newSz := so.handler.OnSetSendBufferSize(v) - so.sendBufferSize.Store(newSz) } // GetReceiveBufferSize gets value for SO_RCVBUF option. @@ -657,36 +644,19 @@ func (so *SocketOptions) GetReceiveBufferSize() int64 { return so.receiveBufferSize.Load() } -// SetReceiveBufferSize sets value for SO_RCVBUF option. -func (so *SocketOptions) SetReceiveBufferSize(receiveBufferSize int64, notify bool) { - if !notify { - so.receiveBufferSize.Store(receiveBufferSize) - return - } - - // Make sure the send buffer size is within the min and max - // allowed. - v := receiveBufferSize - ss := so.getReceiveBufferLimits(so.stackHandler) - min := int64(ss.Min) - max := int64(ss.Max) - // Validate the send buffer size with min and max values. - if v > max { - v = max - } +// ReceiveBufferLimits returns the [min, max) range of allowable receive buffer +// sizes. +func (so *SocketOptions) ReceiveBufferLimits() (min, max int64) { + limits := so.getReceiveBufferLimits(so.stackHandler) + return int64(limits.Min), int64(limits.Max) +} - // Multiply it by factor of 2. - if v < math.MaxInt32/PacketOverheadFactor { - v *= PacketOverheadFactor - if v < min { - v = min - } - } else { - v = math.MaxInt32 +// SetReceiveBufferSize sets the value of the SO_RCVBUF option, optionally +// notifying the owning endpoint. +func (so *SocketOptions) SetReceiveBufferSize(receiveBufferSize int64, notify bool) { + if notify { + oldSz := so.receiveBufferSize.Load() + receiveBufferSize = so.handler.OnSetReceiveBufferSize(receiveBufferSize, oldSz) } - - oldSz := so.receiveBufferSize.Load() - // Notify endpoint about change in buffer size. - newSz := so.handler.OnSetReceiveBufferSize(v, oldSz) - so.receiveBufferSize.Store(newSz) + so.receiveBufferSize.Store(receiveBufferSize) } diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 395ff9a07..6c42ab29b 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -85,6 +85,7 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/hash/jenkins", "//pkg/tcpip/header", + "//pkg/tcpip/internal/tcp", "//pkg/tcpip/ports", "//pkg/tcpip/seqnum", "//pkg/tcpip/transport/tcpconntrack", @@ -95,7 +96,7 @@ go_library( go_test( name = "stack_x_test", - size = "medium", + size = "small", srcs = [ "addressable_endpoint_state_test.go", "ndp_test.go", diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go index ce9cebdaa..7e4b5bf74 100644 --- a/pkg/tcpip/stack/addressable_endpoint_state.go +++ b/pkg/tcpip/stack/addressable_endpoint_state.go @@ -117,10 +117,10 @@ func (a *AddressableEndpointState) releaseAddressStateLocked(addrState *addressS } // AddAndAcquirePermanentAddress implements AddressableEndpoint. -func (a *AddressableEndpointState) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, tcpip.Error) { +func (a *AddressableEndpointState) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties AddressProperties) (AddressEndpoint, tcpip.Error) { a.mu.Lock() defer a.mu.Unlock() - ep, err := a.addAndAcquireAddressLocked(addr, peb, configType, deprecated, true /* permanent */) + ep, err := a.addAndAcquireAddressLocked(addr, properties, true /* permanent */) // From https://golang.org/doc/faq#nil_error: // // Under the covers, interfaces are implemented as two elements, a type T and @@ -149,7 +149,7 @@ func (a *AddressableEndpointState) AddAndAcquirePermanentAddress(addr tcpip.Addr func (a *AddressableEndpointState) AddAndAcquireTemporaryAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior) (AddressEndpoint, tcpip.Error) { a.mu.Lock() defer a.mu.Unlock() - ep, err := a.addAndAcquireAddressLocked(addr, peb, AddressConfigStatic, false /* deprecated */, false /* permanent */) + ep, err := a.addAndAcquireAddressLocked(addr, AddressProperties{PEB: peb}, false /* permanent */) // From https://golang.org/doc/faq#nil_error: // // Under the covers, interfaces are implemented as two elements, a type T and @@ -180,7 +180,7 @@ func (a *AddressableEndpointState) AddAndAcquireTemporaryAddress(addr tcpip.Addr // returned, regardless the kind of address that is being added. // // Precondition: a.mu must be write locked. -func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated, permanent bool) (*addressState, tcpip.Error) { +func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.AddressWithPrefix, properties AddressProperties, permanent bool) (*addressState, tcpip.Error) { // attemptAddToPrimary is false when the address is already in the primary // address list. attemptAddToPrimary := true @@ -208,7 +208,7 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address // We now promote the address. for i, s := range a.mu.primary { if s == addrState { - switch peb { + switch properties.PEB { case CanBePrimaryEndpoint: // The address is already in the primary address list. attemptAddToPrimary = false @@ -222,7 +222,7 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address case NeverPrimaryEndpoint: a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...) default: - panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb)) + panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", properties.PEB)) } break } @@ -249,7 +249,7 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address // or we are adding a new temporary or permanent address. // // The address MUST be write locked at this point. - defer addrState.mu.Unlock() + defer addrState.mu.Unlock() // +checklocksforce if permanent { if addrState.mu.kind.IsPermanent() { @@ -262,11 +262,11 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address } // Acquire the address before returning it. addrState.mu.refs++ - addrState.mu.deprecated = deprecated - addrState.mu.configType = configType + addrState.mu.deprecated = properties.Deprecated + addrState.mu.configType = properties.ConfigType if attemptAddToPrimary { - switch peb { + switch properties.PEB { case NeverPrimaryEndpoint: case CanBePrimaryEndpoint: a.mu.primary = append(a.mu.primary, addrState) @@ -285,7 +285,7 @@ func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.Address a.mu.primary[0] = addrState } default: - panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb)) + panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", properties.PEB)) } } @@ -489,12 +489,12 @@ func (a *AddressableEndpointState) AcquireAssignedAddressOrMatching(localAddr tc // Proceed to add a new temporary endpoint. addr := localAddr.WithPrefix() - ep, err := a.addAndAcquireAddressLocked(addr, tempPEB, AddressConfigStatic, false /* deprecated */, false /* permanent */) + ep, err := a.addAndAcquireAddressLocked(addr, AddressProperties{PEB: tempPEB}, false /* permanent */) if err != nil { // addAndAcquireAddressLocked only returns an error if the address is // already assigned but we just checked above if the address exists so we // expect no error. - panic(fmt.Sprintf("a.addAndAcquireAddressLocked(%s, %d, %d, false, false): %s", addr, tempPEB, AddressConfigStatic, err)) + panic(fmt.Sprintf("a.addAndAcquireAddressLocked(%s, AddressProperties{PEB: %s}, false): %s", addr, tempPEB, err)) } // From https://golang.org/doc/faq#nil_error: diff --git a/pkg/tcpip/stack/addressable_endpoint_state_test.go b/pkg/tcpip/stack/addressable_endpoint_state_test.go index 140f146f6..c55f85743 100644 --- a/pkg/tcpip/stack/addressable_endpoint_state_test.go +++ b/pkg/tcpip/stack/addressable_endpoint_state_test.go @@ -38,9 +38,9 @@ func TestAddressableEndpointStateCleanup(t *testing.T) { } { - ep, err := s.AddAndAcquirePermanentAddress(addr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */) + ep, err := s.AddAndAcquirePermanentAddress(addr, stack.AddressProperties{PEB: stack.NeverPrimaryEndpoint}) if err != nil { - t.Fatalf("s.AddAndAcquirePermanentAddress(%s, %d, %d, false): %s", addr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, err) + t.Fatalf("s.AddAndAcquirePermanentAddress(%s, AddressProperties{PEB: NeverPrimaryEndpoint}): %s", addr, err) } // We don't need the address endpoint. ep.DecRef() diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index f7fbcbaa7..068dab7ce 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -35,7 +35,6 @@ import ( // Currently, only TCP tracking is supported. // Our hash table has 16K buckets. -// TODO(gvisor.dev/issue/170): These should be tunable. const numBuckets = 1 << 14 // Direction of the tuple. @@ -165,8 +164,6 @@ func (cn *conn) updateLocked(tcpHeader header.TCP, hook Hook) { // Update the state of tcb. tcb assumes it's always initialized on the // client. However, we only need to know whether the connection is // established or not, so the client/server distinction isn't important. - // TODO(gvisor.dev/issue/170): Add support in tcpconntrack to handle - // other tcp states. if cn.tcb.IsEmpty() { cn.tcb.Init(tcpHeader) } else if hook == cn.tcbHook { @@ -246,8 +243,7 @@ func (ct *ConnTrack) init() { // connFor gets the conn for pkt if it exists, or returns nil // if it does not. It returns an error when pkt does not contain a valid TCP // header. -// TODO(gvisor.dev/issue/170): Only TCP packets are supported. Need to support -// other transport protocols. +// TODO(gvisor.dev/issue/6168): Support UDP. func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { tid, err := packetToTupleID(pkt) if err != nil { @@ -367,7 +363,7 @@ func (ct *ConnTrack) insertConn(conn *conn) { // Unlocking can happen in any order. ct.buckets[tupleBucket].mu.Unlock() if tupleBucket != replyBucket { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } } @@ -385,7 +381,7 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) bool { return false } - // TODO(gvisor.dev/issue/170): Support other transport protocols. + // TODO(gvisor.dev/issue/6168): Support UDP. if pkt.Network().TransportProtocol() != header.TCPProtocolNumber { return false } @@ -409,16 +405,23 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) bool { // validated if checksum offloading is off. It may require IP defrag if the // packets are fragmented. + var newAddr tcpip.Address + var newPort uint16 + + updateSRCFields := false + switch hook { case Prerouting, Output: if conn.manip == manipDestination { switch dir { case dirOriginal: - tcpHeader.SetDestinationPort(conn.reply.srcPort) - netHeader.SetDestinationAddress(conn.reply.srcAddr) + newPort = conn.reply.srcPort + newAddr = conn.reply.srcAddr case dirReply: - tcpHeader.SetSourcePort(conn.original.dstPort) - netHeader.SetSourceAddress(conn.original.dstAddr) + newPort = conn.original.dstPort + newAddr = conn.original.dstAddr + + updateSRCFields = true } pkt.NatDone = true } @@ -426,11 +429,13 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) bool { if conn.manip == manipSource { switch dir { case dirOriginal: - tcpHeader.SetSourcePort(conn.reply.dstPort) - netHeader.SetSourceAddress(conn.reply.dstAddr) + newPort = conn.reply.dstPort + newAddr = conn.reply.dstAddr + + updateSRCFields = true case dirReply: - tcpHeader.SetDestinationPort(conn.original.srcPort) - netHeader.SetDestinationAddress(conn.original.srcAddr) + newPort = conn.original.srcPort + newAddr = conn.original.srcAddr } pkt.NatDone = true } @@ -441,33 +446,33 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) bool { return false } + fullChecksum := false + updatePseudoHeader := false switch hook { case Prerouting, Input: case Output, Postrouting: // Calculate the TCP checksum and set it. - tcpHeader.SetChecksum(0) - length := uint16(len(tcpHeader) + pkt.Data().Size()) - xsum := header.PseudoHeaderChecksum(header.TCPProtocolNumber, netHeader.SourceAddress(), netHeader.DestinationAddress(), length) if pkt.GSOOptions.Type != GSONone && pkt.GSOOptions.NeedsCsum { - tcpHeader.SetChecksum(xsum) + updatePseudoHeader = true } else if r.RequiresTXTransportChecksum() { - xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum()) - tcpHeader.SetChecksum(^tcpHeader.CalculateChecksum(xsum)) + fullChecksum = true + updatePseudoHeader = true } default: panic(fmt.Sprintf("unrecognized hook = %s", hook)) } - // After modification, IPv4 packets need a valid checksum. - if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { - netHeader := header.IPv4(pkt.NetworkHeader().View()) - netHeader.SetChecksum(0) - netHeader.SetChecksum(^netHeader.CalculateChecksum()) - } + rewritePacket( + netHeader, + tcpHeader, + updateSRCFields, + fullChecksum, + updatePseudoHeader, + newPort, + newAddr, + ) // Update the state of tcb. - // TODO(gvisor.dev/issue/170): Add support in tcpcontrack to handle - // other tcp states. conn.mu.Lock() defer conn.mu.Unlock() @@ -544,8 +549,6 @@ func (ct *ConnTrack) bucket(id tupleID) int { // reapUnused returns the next bucket that should be checked and the time after // which it should be called again. func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) { - // TODO(gvisor.dev/issue/170): This can be more finely controlled, as - // it is in Linux via sysctl. const fractionPerReaping = 128 const maxExpiredPct = 50 const maxFullTraversal = 60 * time.Second @@ -623,7 +626,7 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo // Don't re-unlock if both tuples are in the same bucket. if differentBuckets { - ct.buckets[replyBucket].mu.Unlock() + ct.buckets[replyBucket].mu.Unlock() // +checklocksforce } return true diff --git a/pkg/tcpip/stack/forwarding_test.go b/pkg/tcpip/stack/forwarding_test.go index 72f66441f..c2f1f4798 100644 --- a/pkg/tcpip/stack/forwarding_test.go +++ b/pkg/tcpip/stack/forwarding_test.go @@ -181,10 +181,6 @@ func (*fwdTestNetworkProtocol) MinimumPacketSize() int { return fwdTestNetHeaderLen } -func (*fwdTestNetworkProtocol) DefaultPrefixLen() int { - return fwdTestNetDefaultPrefixLen -} - func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1]) } @@ -342,6 +338,10 @@ func (e *fwdTestLinkEndpoint) WritePackets(r RouteInfo, pkts PacketBufferList, p return n, nil } +func (*fwdTestLinkEndpoint) WriteRawPacket(*PacketBuffer) tcpip.Error { + return &tcpip.ErrNotSupported{} +} + // Wait implements stack.LinkEndpoint.Wait. func (*fwdTestLinkEndpoint) Wait() {} @@ -380,8 +380,15 @@ func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (*faketime.M if err := s.CreateNIC(1, ep1); err != nil { t.Fatal("CreateNIC #1 failed:", err) } - if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil { - t.Fatal("AddAddress #1 failed:", err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: fwdTestNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fwdTestNetDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr1, AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr1, err) } // NIC 2 has the link address "b", and added the network address 2. @@ -393,8 +400,15 @@ func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (*faketime.M if err := s.CreateNIC(2, ep2); err != nil { t.Fatal("CreateNIC #2 failed:", err) } - if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil { - t.Fatal("AddAddress #2 failed:", err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: fwdTestNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x02", + PrefixLen: fwdTestNetDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(2, protocolAddr2, AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 2, protocolAddr2, err) } nic, ok := s.nics[2] diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 0a26f6dd8..f152c0d83 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -268,10 +268,6 @@ const ( // should continue traversing the network stack and false when it should be // dropped. // -// TODO(gvisor.dev/issue/170): PacketBuffer should hold the route, from -// which address can be gathered. Currently, address is only needed for -// prerouting. -// // Precondition: pkt.NetworkHeader is set. func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, r *Route, preroutingAddr tcpip.Address, inNicName, outNicName string) bool { if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber { diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go index 2812c89aa..96cc899bb 100644 --- a/pkg/tcpip/stack/iptables_targets.go +++ b/pkg/tcpip/stack/iptables_targets.go @@ -87,9 +87,6 @@ func (*ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *Route, tcpip.Addre // destination port/IP. Outgoing packets are redirected to the loopback device, // and incoming packets are redirected to the incoming interface (rather than // forwarded). -// -// TODO(gvisor.dev/issue/170): Other flags need to be added after we support -// them. type RedirectTarget struct { // Port indicates port used to redirect. It is immutable. Port uint16 @@ -100,9 +97,6 @@ type RedirectTarget struct { } // Action implements Target.Action. -// TODO(gvisor.dev/issue/170): Parse headers without copying. The current -// implementation only works for Prerouting and calls pkt.Clone(), neither -// of which should be the case. func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r *Route, address tcpip.Address) (RuleVerdict, int) { // Sanity check. if rt.NetworkProtocol != pkt.NetworkProtocolNumber { @@ -136,34 +130,26 @@ func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r panic("redirect target is supported only on output and prerouting hooks") } - // TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if - // we need to change dest address (for OUTPUT chain) or ports. switch protocol := pkt.TransportProtocolNumber; protocol { case header.UDPProtocolNumber: udpHeader := header.UDP(pkt.TransportHeader().View()) - udpHeader.SetDestinationPort(rt.Port) - // Calculate UDP checksum and set it. if hook == Output { - udpHeader.SetChecksum(0) - netHeader := pkt.Network() - netHeader.SetDestinationAddress(address) - // Only calculate the checksum if offloading isn't supported. - if r.RequiresTXTransportChecksum() { - length := uint16(pkt.Size()) - uint16(len(pkt.NetworkHeader().View())) - xsum := header.PseudoHeaderChecksum(protocol, netHeader.SourceAddress(), netHeader.DestinationAddress(), length) - xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum()) - udpHeader.SetChecksum(^udpHeader.CalculateChecksum(xsum)) - } + requiresChecksum := r.RequiresTXTransportChecksum() + rewritePacket( + pkt.Network(), + udpHeader, + false, /* updateSRCFields */ + requiresChecksum, + requiresChecksum, + rt.Port, + address, + ) + } else { + udpHeader.SetDestinationPort(rt.Port) } - // After modification, IPv4 packets need a valid checksum. - if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { - netHeader := header.IPv4(pkt.NetworkHeader().View()) - netHeader.SetChecksum(0) - netHeader.SetChecksum(^netHeader.CalculateChecksum()) - } pkt.NatDone = true case header.TCPProtocolNumber: if ct == nil { @@ -222,26 +208,18 @@ func (st *SNATTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r *Rou switch protocol := pkt.TransportProtocolNumber; protocol { case header.UDPProtocolNumber: - udpHeader := header.UDP(pkt.TransportHeader().View()) - udpHeader.SetChecksum(0) - udpHeader.SetSourcePort(st.Port) - netHeader := pkt.Network() - netHeader.SetSourceAddress(st.Addr) - // Only calculate the checksum if offloading isn't supported. - if r.RequiresTXTransportChecksum() { - length := uint16(pkt.Size()) - uint16(len(pkt.NetworkHeader().View())) - xsum := header.PseudoHeaderChecksum(protocol, netHeader.SourceAddress(), netHeader.DestinationAddress(), length) - xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum()) - udpHeader.SetChecksum(^udpHeader.CalculateChecksum(xsum)) - } + requiresChecksum := r.RequiresTXTransportChecksum() + rewritePacket( + pkt.Network(), + header.UDP(pkt.TransportHeader().View()), + true, /* updateSRCFields */ + requiresChecksum, + requiresChecksum, + st.Port, + st.Addr, + ) - // After modification, IPv4 packets need a valid checksum. - if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { - netHeader := header.IPv4(pkt.NetworkHeader().View()) - netHeader.SetChecksum(0) - netHeader.SetChecksum(^netHeader.CalculateChecksum()) - } pkt.NatDone = true case header.TCPProtocolNumber: if ct == nil { @@ -260,3 +238,42 @@ func (st *SNATTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r *Rou return RuleAccept, 0 } + +func rewritePacket(n header.Network, t header.ChecksummableTransport, updateSRCFields, fullChecksum, updatePseudoHeader bool, newPort uint16, newAddr tcpip.Address) { + if updateSRCFields { + if fullChecksum { + t.SetSourcePortWithChecksumUpdate(newPort) + } else { + t.SetSourcePort(newPort) + } + } else { + if fullChecksum { + t.SetDestinationPortWithChecksumUpdate(newPort) + } else { + t.SetDestinationPort(newPort) + } + } + + if updatePseudoHeader { + var oldAddr tcpip.Address + if updateSRCFields { + oldAddr = n.SourceAddress() + } else { + oldAddr = n.DestinationAddress() + } + + t.UpdateChecksumPseudoHeaderAddress(oldAddr, newAddr, fullChecksum) + } + + if checksummableNetHeader, ok := n.(header.ChecksummableNetwork); ok { + if updateSRCFields { + checksummableNetHeader.SetSourceAddressWithChecksumUpdate(newAddr) + } else { + checksummableNetHeader.SetDestinationAddressWithChecksumUpdate(newAddr) + } + } else if updateSRCFields { + n.SetSourceAddress(newAddr) + } else { + n.SetDestinationAddress(newAddr) + } +} diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index 93592e7f5..66e5f22ac 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -242,7 +242,6 @@ type IPHeaderFilter struct { func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, inNicName, outNicName string) bool { // Extract header fields. var ( - // TODO(gvisor.dev/issue/170): Support other filter fields. transProto tcpip.TransportProtocolNumber dstAddr tcpip.Address srcAddr tcpip.Address @@ -291,7 +290,6 @@ func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, inNicName, outNicNa return true case Postrouting: - // TODO(gvisor.dev/issue/170): Add the check for POSTROUTING. return true default: panic(fmt.Sprintf("unknown hook: %d", hook)) diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 133bacdd0..40b33b6b5 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -52,17 +52,6 @@ const ( linkAddr4 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09") defaultPrefixLen = 128 - - // Extra time to use when waiting for an async event to occur. - defaultAsyncPositiveEventTimeout = 10 * time.Second - - // Extra time to use when waiting for an async event to not occur. - // - // Since a negative check is used to make sure an event did not happen, it is - // okay to use a smaller timeout compared to the positive case since execution - // stall in regards to the monotonic clock will not affect the expected - // outcome. - defaultAsyncNegativeEventTimeout = time.Second ) var ( @@ -112,11 +101,13 @@ type ndpDADEvent struct { res stack.DADResult } -type ndpRouterEvent struct { - nicID tcpip.NICID - addr tcpip.Address - // true if router was discovered, false if invalidated. - discovered bool +type ndpOffLinkRouteEvent struct { + nicID tcpip.NICID + subnet tcpip.Subnet + router tcpip.Address + prf header.NDPRoutePreference + // true if route was updated, false if invalidated. + updated bool } type ndpPrefixEvent struct { @@ -140,6 +131,10 @@ type ndpAutoGenAddrEvent struct { eventType ndpAutoGenAddrEventType } +func (e ndpAutoGenAddrEvent) String() string { + return fmt.Sprintf("%T{nicID=%d addr=%s eventType=%d}", e, e.nicID, e.addr, e.eventType) +} + type ndpRDNSS struct { addrs []tcpip.Address lifetime time.Duration @@ -167,10 +162,8 @@ var _ ipv6.NDPDispatcher = (*ndpDispatcher)(nil) // related events happen for test purposes. type ndpDispatcher struct { dadC chan ndpDADEvent - routerC chan ndpRouterEvent - rememberRouter bool + offLinkRouteC chan ndpOffLinkRouteEvent prefixC chan ndpPrefixEvent - rememberPrefix bool autoGenAddrC chan ndpAutoGenAddrEvent rdnssC chan ndpRDNSSEvent dnsslC chan ndpDNSSLEvent @@ -189,32 +182,35 @@ func (n *ndpDispatcher) OnDuplicateAddressDetectionResult(nicID tcpip.NICID, add } } -// Implements ipv6.NDPDispatcher.OnDefaultRouterDiscovered. -func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool { - if c := n.routerC; c != nil { - c <- ndpRouterEvent{ +// Implements ipv6.NDPDispatcher.OnOffLinkRouteUpdated. +func (n *ndpDispatcher) OnOffLinkRouteUpdated(nicID tcpip.NICID, subnet tcpip.Subnet, router tcpip.Address, prf header.NDPRoutePreference) { + if c := n.offLinkRouteC; c != nil { + c <- ndpOffLinkRouteEvent{ nicID, - addr, + subnet, + router, + prf, true, } } - - return n.rememberRouter } -// Implements ipv6.NDPDispatcher.OnDefaultRouterInvalidated. -func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) { - if c := n.routerC; c != nil { - c <- ndpRouterEvent{ +// Implements ipv6.NDPDispatcher.OnOffLinkRouteInvalidated. +func (n *ndpDispatcher) OnOffLinkRouteInvalidated(nicID tcpip.NICID, subnet tcpip.Subnet, router tcpip.Address) { + if c := n.offLinkRouteC; c != nil { + var prf header.NDPRoutePreference + c <- ndpOffLinkRouteEvent{ nicID, - addr, + subnet, + router, + prf, false, } } } // Implements ipv6.NDPDispatcher.OnOnLinkPrefixDiscovered. -func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool { +func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) { if c := n.prefixC; c != nil { c <- ndpPrefixEvent{ nicID, @@ -222,8 +218,6 @@ func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip true, } } - - return n.rememberPrefix } // Implements ipv6.NDPDispatcher.OnOnLinkPrefixInvalidated. @@ -237,7 +231,7 @@ func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpi } } -func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) bool { +func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) { if c := n.autoGenAddrC; c != nil { c <- ndpAutoGenAddrEvent{ nicID, @@ -245,7 +239,6 @@ func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWi newAddr, } } - return true } func (n *ndpDispatcher) OnAutoGenAddressDeprecated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) { @@ -340,8 +333,12 @@ func TestDADDisabled(t *testing.T) { Address: addr1, PrefixLen: defaultPrefixLen, } - if err := s.AddAddressWithPrefix(nicID, header.IPv6ProtocolNumber, addrWithPrefix); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addrWithPrefix, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addrWithPrefix, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID, protocolAddr, err) } // Should get the address immediately since we should not have performed @@ -386,12 +383,15 @@ func TestDADResolveLoopback(t *testing.T) { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - addrWithPrefix := tcpip.AddressWithPrefix{ - Address: addr1, - PrefixLen: defaultPrefixLen, + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: addr1, + PrefixLen: defaultPrefixLen, + }, } - if err := s.AddAddressWithPrefix(nicID, header.IPv6ProtocolNumber, addrWithPrefix); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addrWithPrefix, err) + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID, protocolAddr, err) } // Address should not be considered bound to the NIC yet (DAD ongoing). @@ -524,8 +524,12 @@ func TestDADResolve(t *testing.T) { Address: addr1, PrefixLen: defaultPrefixLen, } - if err := s.AddAddressWithPrefix(nicID, header.IPv6ProtocolNumber, addrWithPrefix); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addrWithPrefix, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addrWithPrefix, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID, protocolAddr, err) } // Make sure the address does not resolve before the resolution time has @@ -747,8 +751,12 @@ func TestDADFail(t *testing.T) { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addr1.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } // Address should not be considered bound to the NIC yet @@ -785,8 +793,8 @@ func TestDADFail(t *testing.T) { // Attempting to add the address again should not fail if the address's // state was cleaned up when DAD failed. - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } }) } @@ -858,8 +866,12 @@ func TestDADStop(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, header.IPv6ProtocolNumber, addr1, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addr1.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } // Address should not be considered bound to the NIC yet (DAD ongoing). @@ -982,17 +994,29 @@ func TestSetNDPConfigurations(t *testing.T) { // Add addresses for each NIC. addrWithPrefix1 := tcpip.AddressWithPrefix{Address: addr1, PrefixLen: defaultPrefixLen} - if err := s.AddAddressWithPrefix(nicID1, header.IPv6ProtocolNumber, addrWithPrefix1); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID1, header.IPv6ProtocolNumber, addrWithPrefix1, err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addrWithPrefix1, + } + if err := s.AddProtocolAddress(nicID1, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID1, protocolAddr1, err) } addrWithPrefix2 := tcpip.AddressWithPrefix{Address: addr2, PrefixLen: defaultPrefixLen} - if err := s.AddAddressWithPrefix(nicID2, header.IPv6ProtocolNumber, addrWithPrefix2); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID2, header.IPv6ProtocolNumber, addrWithPrefix2, err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addrWithPrefix2, + } + if err := s.AddProtocolAddress(nicID2, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID2, protocolAddr2, err) } expectDADEvent(nicID2, addr2) addrWithPrefix3 := tcpip.AddressWithPrefix{Address: addr3, PrefixLen: defaultPrefixLen} - if err := s.AddAddressWithPrefix(nicID3, header.IPv6ProtocolNumber, addrWithPrefix3); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s) = %s", nicID3, header.IPv6ProtocolNumber, addrWithPrefix3, err) + protocolAddr3 := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addrWithPrefix3, + } + if err := s.AddProtocolAddress(nicID3, protocolAddr3, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) = %s", nicID3, protocolAddr3, err) } expectDADEvent(nicID3, addr3) @@ -1039,9 +1063,12 @@ func TestSetNDPConfigurations(t *testing.T) { } } -// raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options -// and DHCPv6 configurations specified. -func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) *stack.PacketBuffer { +// raBuf returns a valid NDP Router Advertisement with options, router +// preference and DHCPv6 configurations specified. +func raBuf(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, prf header.NDPRoutePreference, optSer header.NDPOptionsSerializer) *stack.PacketBuffer { + const flagsByte = 1 + const routerLifetimeOffset = 2 + icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + optSer.Length() hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize) pkt := header.ICMPv6(hdr.Prepend(icmpSize)) @@ -1050,19 +1077,19 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo raPayload := pkt.MessageBody() ra := header.NDPRouterAdvert(raPayload) // Populate the Router Lifetime. - binary.BigEndian.PutUint16(raPayload[2:], rl) + binary.BigEndian.PutUint16(raPayload[routerLifetimeOffset:], rl) // Populate the Managed Address flag field. if managedAddress { - // The Managed Addresses flag field is the 7th bit of byte #1 (0-indexing) - // of the RA payload. - raPayload[1] |= 1 << 7 + // The Managed Addresses flag field is the 7th bit of the flags byte. + raPayload[flagsByte] |= 1 << 7 } // Populate the Other Configurations flag field. if otherConfigurations { - // The Other Configurations flag field is the 6th bit of byte #1 - // (0-indexing) of the RA payload. - raPayload[1] |= 1 << 6 + // The Other Configurations flag field is the 6th bit of the flags byte. + raPayload[flagsByte] |= 1 << 6 } + // The Prf field is held in the flags byte. + raPayload[flagsByte] |= byte(prf) << 3 opts := ra.Options() opts.Serialize(optSer) pkt.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ @@ -1090,7 +1117,7 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo // Note, raBufWithOpts does not populate any of the RA fields other than the // Router Lifetime. func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) *stack.PacketBuffer { - return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer) + return raBuf(ip, rl, false /* managedAddress */, false /* otherConfigurations */, 0 /* prf */, optSer) } // raBufWithDHCPv6 returns a valid NDP Router Advertisement with DHCPv6 related @@ -1098,18 +1125,26 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ // // Note, raBufWithDHCPv6 does not populate any of the RA fields other than the // DHCPv6 related ones. -func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) *stack.PacketBuffer { - return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfiguratiosns, header.NDPOptionsSerializer{}) +func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfigurations bool) *stack.PacketBuffer { + return raBuf(ip, 0, managedAddresses, otherConfigurations, 0 /* prf */, header.NDPOptionsSerializer{}) } // raBuf returns a valid NDP Router Advertisement. // // Note, raBuf does not populate any of the RA fields other than the // Router Lifetime. -func raBuf(ip tcpip.Address, rl uint16) *stack.PacketBuffer { +func raBufSimple(ip tcpip.Address, rl uint16) *stack.PacketBuffer { return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{}) } +// raBufWithPrf returns a valid NDP Router Advertisement with a preference. +// +// Note, raBufWithPrf does not populate any of the RA fields other than the +// Router Lifetime and Default Router Preference fields. +func raBufWithPrf(ip tcpip.Address, rl uint16, prf header.NDPRoutePreference) *stack.PacketBuffer { + return raBuf(ip, rl, false /* managedAddress */, false /* otherConfigurations */, prf, header.NDPOptionsSerializer{}) +} + // raBufWithPI returns a valid NDP Router Advertisement with a single Prefix // Information option. // @@ -1148,6 +1183,39 @@ func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, on }) } +// raBufWithRIO returns a valid NDP Router Advertisement with a single Route +// Information option. +// +// All fields in the RA will be zero except the RIO option. +func raBufWithRIO(t *testing.T, ip tcpip.Address, prefix tcpip.AddressWithPrefix, lifetimeSeconds uint32, prf header.NDPRoutePreference) *stack.PacketBuffer { + // buf will hold the route information option after the Type and Length + // fields. + // + // 2.3. Route Information Option + // + // 0 1 2 3 + // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Type | Length | Prefix Length |Resvd|Prf|Resvd| + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Route Lifetime | + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + // | Prefix (Variable Length) | + // . . + // . . + // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + var buf [22]byte + buf[0] = uint8(prefix.PrefixLen) + buf[1] = byte(prf) << 3 + binary.BigEndian.PutUint32(buf[2:], lifetimeSeconds) + if n := copy(buf[6:], prefix.Address); n != len(prefix.Address) { + t.Fatalf("got copy(...) = %d, want = %d", n, len(prefix.Address)) + } + return raBufWithOpts(ip, 0 /* router lifetime */, header.NDPOptionsSerializer{ + header.NDPRouteInformation(buf[:]), + }) +} + func TestDynamicConfigurationsDisabled(t *testing.T) { const ( nicID = 1 @@ -1169,7 +1237,7 @@ func TestDynamicConfigurationsDisabled(t *testing.T) { config: func(enable bool) ipv6.NDPConfigurations { return ipv6.NDPConfigurations{DiscoverDefaultRouters: enable} }, - ra: raBuf(llAddr2, 1000), + ra: raBufSimple(llAddr2, 1000), }, { name: "No Prefix Discovery", @@ -1205,9 +1273,9 @@ func TestDynamicConfigurationsDisabled(t *testing.T) { t.Run(fmt.Sprintf("HandleRAs(%s), Forwarding(%t), Enabled(%t)", handle, forwarding, enable), func(t *testing.T) { ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 1), - prefixC: make(chan ndpPrefixEvent, 1), - autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + offLinkRouteC: make(chan ndpOffLinkRouteEvent, 1), + prefixC: make(chan ndpPrefixEvent, 1), + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } ndpConfigs := test.config(enable) ndpConfigs.HandleRAs = handle @@ -1277,8 +1345,8 @@ func TestDynamicConfigurationsDisabled(t *testing.T) { t.Errorf("got v6Stats.UnhandledRouterAdvertisements.Value() = %d, want = %d", got, want) } select { - case e := <-ndpDisp.routerC: - t.Errorf("unexpectedly discovered a router when configured not to: %#v", e) + case e := <-ndpDisp.offLinkRouteC: + t.Errorf("unexpectedly updated an off-link route when configured not to: %#v", e) default: } select { @@ -1304,10 +1372,8 @@ func boolToUint64(v bool) uint64 { return 0 } -// Check e to make sure that the event is for addr on nic with ID 1, and the -// discovered flag set to discovered. -func checkRouterEvent(e ndpRouterEvent, addr tcpip.Address, discovered bool) string { - return cmp.Diff(ndpRouterEvent{nicID: 1, addr: addr, discovered: discovered}, e, cmp.AllowUnexported(e)) +func checkOffLinkRouteEvent(e ndpOffLinkRouteEvent, nicID tcpip.NICID, subnet tcpip.Subnet, router tcpip.Address, prf header.NDPRoutePreference, updated bool) string { + return cmp.Diff(ndpOffLinkRouteEvent{nicID: nicID, subnet: subnet, router: router, prf: prf, updated: updated}, e, cmp.AllowUnexported(e)) } func testWithRAs(t *testing.T, f func(*testing.T, ipv6.HandleRAsConfiguration, bool)) { @@ -1340,167 +1406,176 @@ func testWithRAs(t *testing.T, f func(*testing.T, ipv6.HandleRAsConfiguration, b } } -// TestRouterDiscoveryDispatcherNoRemember tests that the stack does not -// remember a discovered router when the dispatcher asks it not to. -func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) { - ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 1), - } - e := channel.New(0, 1280, linkAddr1) - clock := faketime.NewManualClock() - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, - DiscoverDefaultRouters: true, - }, - NDPDisp: &ndpDisp, - })}, - Clock: clock, - }) +func TestOffLinkRouteDiscovery(t *testing.T) { + const nicID = 1 - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) - } + moreSpecificPrefix := tcpip.AddressWithPrefix{Address: testutil.MustParse6("a00::"), PrefixLen: 16} + tests := []struct { + name string - // Receive an RA for a router we should not remember. - const lifetimeSeconds = 1 - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, lifetimeSeconds)) - select { - case e := <-ndpDisp.routerC: - if diff := checkRouterEvent(e, llAddr2, true); diff != "" { - t.Errorf("router event mismatch (-want +got):\n%s", diff) - } - default: - t.Fatal("expected router discovery event") - } + discoverDefaultRouters bool + discoverMoreSpecificRoutes bool - // Wait for the invalidation time plus some buffer to make sure we do - // not actually receive any invalidation events as we should not have - // remembered the router in the first place. - clock.Advance(lifetimeSeconds * time.Second) - select { - case <-ndpDisp.routerC: - t.Fatal("should not have received any router events") - default: + dest tcpip.Subnet + ra func(*testing.T, tcpip.Address, uint16, header.NDPRoutePreference) *stack.PacketBuffer + }{ + { + name: "Default router discovery", + discoverDefaultRouters: true, + discoverMoreSpecificRoutes: false, + dest: header.IPv6EmptySubnet, + ra: func(_ *testing.T, router tcpip.Address, lifetimeSeconds uint16, prf header.NDPRoutePreference) *stack.PacketBuffer { + return raBufWithPrf(router, lifetimeSeconds, prf) + }, + }, + { + name: "More-specific route discovery", + discoverDefaultRouters: false, + discoverMoreSpecificRoutes: true, + dest: moreSpecificPrefix.Subnet(), + ra: func(t *testing.T, router tcpip.Address, lifetimeSeconds uint16, prf header.NDPRoutePreference) *stack.PacketBuffer { + return raBufWithRIO(t, router, moreSpecificPrefix, uint32(lifetimeSeconds), prf) + }, + }, } -} -func TestRouterDiscovery(t *testing.T) { - testWithRAs(t, func(t *testing.T, handleRAs ipv6.HandleRAsConfiguration, forwarding bool) { - ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 1), - rememberRouter: true, - } - e := channel.New(0, 1280, linkAddr1) - clock := faketime.NewManualClock() - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - HandleRAs: handleRAs, - DiscoverDefaultRouters: true, - }, - NDPDisp: &ndpDisp, - })}, - Clock: clock, - }) + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + testWithRAs(t, func(t *testing.T, handleRAs ipv6.HandleRAsConfiguration, forwarding bool) { + ndpDisp := ndpDispatcher{ + offLinkRouteC: make(chan ndpOffLinkRouteEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: handleRAs, + DiscoverDefaultRouters: test.discoverDefaultRouters, + DiscoverMoreSpecificRoutes: test.discoverMoreSpecificRoutes, + }, + NDPDisp: &ndpDisp, + })}, + Clock: clock, + }) - expectRouterEvent := func(addr tcpip.Address, discovered bool) { - t.Helper() + expectOffLinkRouteEvent := func(addr tcpip.Address, prf header.NDPRoutePreference, updated bool) { + t.Helper() - select { - case e := <-ndpDisp.routerC: - if diff := checkRouterEvent(e, addr, discovered); diff != "" { - t.Errorf("router event mismatch (-want +got):\n%s", diff) + select { + case e := <-ndpDisp.offLinkRouteC: + if diff := checkOffLinkRouteEvent(e, nicID, test.dest, addr, prf, updated); diff != "" { + t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("expected router discovery event") + } } - default: - t.Fatal("expected router discovery event") - } - } - expectAsyncRouterInvalidationEvent := func(addr tcpip.Address, timeout time.Duration) { - t.Helper() + expectAsyncOffLinkRouteInvalidationEvent := func(addr tcpip.Address, timeout time.Duration) { + t.Helper() - clock.Advance(timeout) - select { - case e := <-ndpDisp.routerC: - if diff := checkRouterEvent(e, addr, false); diff != "" { - t.Errorf("router event mismatch (-want +got):\n%s", diff) + clock.Advance(timeout) + select { + case e := <-ndpDisp.offLinkRouteC: + var prf header.NDPRoutePreference + if diff := checkOffLinkRouteEvent(e, nicID, test.dest, addr, prf, false); diff != "" { + t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) + } + default: + t.Fatal("timed out waiting for router discovery event") + } } - default: - t.Fatal("timed out waiting for router discovery event") - } - } - - if err := s.SetForwardingDefaultAndAllNICs(ipv6.ProtocolNumber, forwarding); err != nil { - t.Fatalf("SetForwardingDefaultAndAllNICs(%d, %t): %s", ipv6.ProtocolNumber, forwarding, err) - } - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) - } + if err := s.SetForwardingDefaultAndAllNICs(ipv6.ProtocolNumber, forwarding); err != nil { + t.Fatalf("SetForwardingDefaultAndAllNICs(%d, %t): %s", ipv6.ProtocolNumber, forwarding, err) + } - // Rx an RA from lladdr2 with zero lifetime. It should not be - // remembered. - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0)) - select { - case <-ndpDisp.routerC: - t.Fatal("unexpectedly discovered a router with 0 lifetime") - default: - } + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } - // Rx an RA from lladdr2 with a huge lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000)) - expectRouterEvent(llAddr2, true) + // Rx an RA from lladdr2 with zero lifetime. It should not be + // remembered. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 0, header.MediumRoutePreference)) + select { + case <-ndpDisp.offLinkRouteC: + t.Fatal("unexpectedly updated an off-link route with 0 lifetime") + default: + } - // Rx an RA from another router (lladdr3) with non-zero lifetime. - const l3LifetimeSeconds = 6 - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, l3LifetimeSeconds)) - expectRouterEvent(llAddr3, true) + // Discover an off-link route through llAddr2. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 1000, header.ReservedRoutePreference)) + if test.discoverMoreSpecificRoutes { + // The reserved value is considered invalid with more-specific route + // discovery so we inject the same packet but with the default + // (medium) preference value. + select { + case <-ndpDisp.offLinkRouteC: + t.Fatal("unexpectedly updated an off-link route with a reserved preference value") + default: + } + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 1000, header.MediumRoutePreference)) + } + expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, true) + + // Rx an RA from another router (lladdr3) with non-zero lifetime and + // non-default preference value. + const l3LifetimeSeconds = 6 + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr3, l3LifetimeSeconds, header.HighRoutePreference)) + expectOffLinkRouteEvent(llAddr3, header.HighRoutePreference, true) + + // Rx an RA from lladdr2 with lesser lifetime and default (medium) + // preference value. + const l2LifetimeSeconds = 2 + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, l2LifetimeSeconds, header.MediumRoutePreference)) + select { + case <-ndpDisp.offLinkRouteC: + t.Fatal("should not receive a off-link route event when updating lifetimes for known routers") + default: + } - // Rx an RA from lladdr2 with lesser lifetime. - const l2LifetimeSeconds = 2 - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, l2LifetimeSeconds)) - select { - case <-ndpDisp.routerC: - t.Fatal("Should not receive a router event when updating lifetimes for known routers") - default: - } + // Rx an RA from lladdr2 with a different preference. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, l2LifetimeSeconds, header.LowRoutePreference)) + expectOffLinkRouteEvent(llAddr2, header.LowRoutePreference, true) - // Wait for lladdr2's router invalidation job to execute. The lifetime - // of the router should have been updated to the most recent (smaller) - // lifetime. - // - // Wait for the normal lifetime plus an extra bit for the - // router to get invalidated. If we don't get an invalidation - // event after this time, then something is wrong. - expectAsyncRouterInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second) - - // Rx an RA from lladdr2 with huge lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000)) - expectRouterEvent(llAddr2, true) - - // Rx an RA from lladdr2 with zero lifetime. It should be invalidated. - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0)) - expectRouterEvent(llAddr2, false) - - // Wait for lladdr3's router invalidation job to execute. The lifetime - // of the router should have been updated to the most recent (smaller) - // lifetime. - // - // Wait for the normal lifetime plus an extra bit for the - // router to get invalidated. If we don't get an invalidation - // event after this time, then something is wrong. - expectAsyncRouterInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second) - }) + // Wait for lladdr2's router invalidation job to execute. The lifetime + // of the router should have been updated to the most recent (smaller) + // lifetime. + // + // Wait for the normal lifetime plus an extra bit for the + // router to get invalidated. If we don't get an invalidation + // event after this time, then something is wrong. + expectAsyncOffLinkRouteInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second) + + // Rx an RA from lladdr2 with huge lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 1000, header.MediumRoutePreference)) + expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, true) + + // Rx an RA from lladdr2 with zero lifetime. It should be invalidated. + e.InjectInbound(header.IPv6ProtocolNumber, test.ra(t, llAddr2, 0, header.MediumRoutePreference)) + expectOffLinkRouteEvent(llAddr2, header.MediumRoutePreference, false) + + // Wait for lladdr3's router invalidation job to execute. The lifetime + // of the router should have been updated to the most recent (smaller) + // lifetime. + // + // Wait for the normal lifetime plus an extra bit for the + // router to get invalidated. If we don't get an invalidation + // event after this time, then something is wrong. + expectAsyncOffLinkRouteInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second) + }) + }) + } } // TestRouterDiscoveryMaxRouters tests that only -// ipv6.MaxDiscoveredDefaultRouters discovered routers are remembered. +// ipv6.MaxDiscoveredOffLinkRoutes discovered routers are remembered. func TestRouterDiscoveryMaxRouters(t *testing.T) { + const nicID = 1 + ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 1), - rememberRouter: true, + offLinkRouteC: make(chan ndpOffLinkRouteEvent, 1), } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ @@ -1513,23 +1588,23 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) { })}, }) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } // Receive an RA from 2 more than the max number of discovered routers. - for i := 1; i <= ipv6.MaxDiscoveredDefaultRouters+2; i++ { + for i := 1; i <= ipv6.MaxDiscoveredOffLinkRoutes+2; i++ { linkAddr := []byte{2, 2, 3, 4, 5, 0} linkAddr[5] = byte(i) llAddr := header.LinkLocalAddr(tcpip.LinkAddress(linkAddr)) - e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr, 5)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufSimple(llAddr, 5)) - if i <= ipv6.MaxDiscoveredDefaultRouters { + if i <= ipv6.MaxDiscoveredOffLinkRoutes { select { - case e := <-ndpDisp.routerC: - if diff := checkRouterEvent(e, llAddr, true); diff != "" { - t.Errorf("router event mismatch (-want +got):\n%s", diff) + case e := <-ndpDisp.offLinkRouteC: + if diff := checkOffLinkRouteEvent(e, nicID, header.IPv6EmptySubnet, llAddr, header.MediumRoutePreference, true); diff != "" { + t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) } default: t.Fatal("expected router discovery event") @@ -1537,7 +1612,7 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) { } else { select { - case <-ndpDisp.routerC: + case <-ndpDisp.offLinkRouteC: t.Fatal("should not have discovered a new router after we already discovered the max number of routers") default: } @@ -1551,54 +1626,6 @@ func checkPrefixEvent(e ndpPrefixEvent, prefix tcpip.Subnet, discovered bool) st return cmp.Diff(ndpPrefixEvent{nicID: 1, prefix: prefix, discovered: discovered}, e, cmp.AllowUnexported(e)) } -// TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not -// remember a discovered on-link prefix when the dispatcher asks it not to. -func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) { - prefix, subnet, _ := prefixSubnetAddr(0, "") - - ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, 1), - } - e := channel.New(0, 1280, linkAddr1) - clock := faketime.NewManualClock() - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, - DiscoverOnLinkPrefixes: true, - }, - NDPDisp: &ndpDisp, - })}, - Clock: clock, - }) - - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) - } - - // Receive an RA with prefix that we should not remember. - const lifetimeSeconds = 1 - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetimeSeconds, 0)) - select { - case e := <-ndpDisp.prefixC: - if diff := checkPrefixEvent(e, subnet, true); diff != "" { - t.Errorf("prefix event mismatch (-want +got):\n%s", diff) - } - default: - t.Fatal("expected prefix discovery event") - } - - // Wait for the invalidation time plus some buffer to make sure we do - // not actually receive any invalidation events as we should not have - // remembered the prefix in the first place. - clock.Advance(lifetimeSeconds * time.Second) - select { - case <-ndpDisp.prefixC: - t.Fatal("should not have received any prefix events") - default: - } -} - func TestPrefixDiscovery(t *testing.T) { prefix1, subnet1, _ := prefixSubnetAddr(0, "") prefix2, subnet2, _ := prefixSubnetAddr(1, "") @@ -1606,8 +1633,7 @@ func TestPrefixDiscovery(t *testing.T) { testWithRAs(t, func(t *testing.T, handleRAs ipv6.HandleRAsConfiguration, forwarding bool) { ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, 1), - rememberPrefix: true, + prefixC: make(chan ndpPrefixEvent, 1), } e := channel.New(0, 1280, linkAddr1) clock := faketime.NewManualClock() @@ -1697,17 +1723,6 @@ func TestPrefixDiscovery(t *testing.T) { } func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { - // Update the infinite lifetime value to a smaller value so we can test - // that when we receive a PI with such a lifetime value, we do not - // invalidate the prefix. - const testInfiniteLifetimeSeconds = 2 - const testInfiniteLifetime = testInfiniteLifetimeSeconds * time.Second - saved := header.NDPInfiniteLifetime - header.NDPInfiniteLifetime = testInfiniteLifetime - defer func() { - header.NDPInfiniteLifetime = saved - }() - prefix := tcpip.AddressWithPrefix{ Address: testutil.MustParse6("102:304:506:708::"), PrefixLen: 64, @@ -1715,8 +1730,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { subnet := prefix.Subnet() ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, 1), - rememberPrefix: true, + prefixC: make(chan ndpPrefixEvent, 1), } e := channel.New(0, 1280, linkAddr1) clock := faketime.NewManualClock() @@ -1750,9 +1764,9 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { // Receive an RA with prefix in an NDP Prefix Information option (PI) // with infinite valid lifetime which should not get invalidated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, infiniteLifetimeSeconds, 0)) expectPrefixEvent(subnet, true) - clock.Advance(testInfiniteLifetime) + clock.Advance(header.NDPInfiniteLifetime) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") @@ -1760,9 +1774,8 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { } // Receive an RA with finite lifetime. - // The prefix should get invalidated after 1s. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0)) - clock.Advance(testInfiniteLifetime) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, infiniteLifetimeSeconds-1, 0)) + clock.Advance(header.NDPInfiniteLifetime - time.Second) select { case e := <-ndpDisp.prefixC: if diff := checkPrefixEvent(e, subnet, false); diff != "" { @@ -1773,23 +1786,13 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { } // Receive an RA with finite lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, infiniteLifetimeSeconds-1, 0)) expectPrefixEvent(subnet, true) // Receive an RA with prefix with an infinite lifetime. // The prefix should not be invalidated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0)) - clock.Advance(testInfiniteLifetime) - select { - case <-ndpDisp.prefixC: - t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") - default: - } - - // Receive an RA with a prefix with a lifetime value greater than the - // set infinite lifetime value. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds+1, 0)) - clock.Advance((testInfiniteLifetimeSeconds + 1) * time.Second) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, infiniteLifetimeSeconds, 0)) + clock.Advance(header.NDPInfiniteLifetime) select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") @@ -1806,8 +1809,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { // ipv6.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered. func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) { ndpDisp := ndpDispatcher{ - prefixC: make(chan ndpPrefixEvent, ipv6.MaxDiscoveredOnLinkPrefixes+3), - rememberPrefix: true, + prefixC: make(chan ndpPrefixEvent, ipv6.MaxDiscoveredOnLinkPrefixes+3), } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ @@ -1884,17 +1886,12 @@ func checkAutoGenAddrEvent(e ndpAutoGenAddrEvent, addr tcpip.AddressWithPrefix, return cmp.Diff(ndpAutoGenAddrEvent{nicID: 1, addr: addr, eventType: eventType}, e, cmp.AllowUnexported(e)) } +const minVLSeconds = uint32(ipv6.MinPrefixInformationValidLifetimeForUpdate / time.Second) +const infiniteLifetimeSeconds = uint32(header.NDPInfiniteLifetime / time.Second) + // TestAutoGenAddr tests that an address is properly generated and invalidated // when configured to do so. func TestAutoGenAddr(t *testing.T) { - const newMinVL = 2 - newMinVLDuration := newMinVL * time.Second - saved := ipv6.MinPrefixInformationValidLifetimeForUpdate - defer func() { - ipv6.MinPrefixInformationValidLifetimeForUpdate = saved - }() - ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration - prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) @@ -1903,6 +1900,7 @@ func TestAutoGenAddr(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } e := channel.New(0, 1280, linkAddr1) + clock := faketime.NewManualClock() s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ NDPConfigs: ipv6.NDPConfigurations{ @@ -1911,6 +1909,7 @@ func TestAutoGenAddr(t *testing.T) { }, NDPDisp: &ndpDisp, })}, + Clock: clock, }) if err := s.SetForwardingDefaultAndAllNICs(ipv6.ProtocolNumber, forwarding); err != nil { @@ -1960,8 +1959,9 @@ func TestAutoGenAddr(t *testing.T) { default: } - // Receive an RA with prefix2 in a PI. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0)) + // Receive an RA with prefix2 in a PI with a valid lifetime that exceeds + // the minimum. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, minVLSeconds+1, 0)) expectAutoGenAddrEvent(addr2, newAddr) if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) { t.Fatalf("Should have %s in the list of addresses", addr1) @@ -1971,7 +1971,7 @@ func TestAutoGenAddr(t *testing.T) { } // Refresh valid lifetime for addr of prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly auto-generated an address when we already have an address for a prefix") @@ -1979,12 +1979,13 @@ func TestAutoGenAddr(t *testing.T) { } // Wait for addr of prefix1 to be invalidated. + clock.Advance(ipv6.MinPrefixInformationValidLifetimeForUpdate) select { case e := <-ndpDisp.autoGenAddrC: if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): + default: t.Fatal("timed out waiting for addr auto gen event") } if containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) { @@ -2014,20 +2015,7 @@ func addressCheck(addrs []tcpip.ProtocolAddress, containList, notContainList []t // TestAutoGenTempAddr tests that temporary SLAAC addresses are generated when // configured to do so as part of IPv6 Privacy Extensions. func TestAutoGenTempAddr(t *testing.T) { - const ( - nicID = 1 - newMinVL = 5 - newMinVLDuration = newMinVL * time.Second - ) - - savedMinPrefixInformationValidLifetimeForUpdate := ipv6.MinPrefixInformationValidLifetimeForUpdate - savedMaxDesync := ipv6.MaxDesyncFactor - defer func() { - ipv6.MinPrefixInformationValidLifetimeForUpdate = savedMinPrefixInformationValidLifetimeForUpdate - ipv6.MaxDesyncFactor = savedMaxDesync - }() - ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration - ipv6.MaxDesyncFactor = time.Nanosecond + const nicID = 1 prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) @@ -2047,218 +2035,211 @@ func TestAutoGenTempAddr(t *testing.T) { }, } - // This Run will not return until the parallel tests finish. - // - // We need this because we need to do some teardown work after the - // parallel tests complete. - // - // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for - // more details. - t.Run("group", func(t *testing.T) { - for i, test := range tests { - i := i - test := test - - t.Run(test.name, func(t *testing.T) { - t.Parallel() - - seed := []byte{uint8(i)} - var tempIIDHistory [header.IIDSize]byte - header.InitialTempIID(tempIIDHistory[:], seed, nicID) - newTempAddr := func(stableAddr tcpip.Address) tcpip.AddressWithPrefix { - return header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableAddr) - } - - ndpDisp := ndpDispatcher{ - dadC: make(chan ndpDADEvent, 2), - autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), - } - e := channel.New(0, 1280, linkAddr1) - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - DADConfigs: stack.DADConfigurations{ - DupAddrDetectTransmits: test.dupAddrTransmits, - RetransmitTimer: test.retransmitTimer, - }, - NDPConfigs: ipv6.NDPConfigurations{ - HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, - AutoGenGlobalAddresses: true, - AutoGenTempGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, - TempIIDSeed: seed, - })}, - }) - - if err := s.CreateNIC(nicID, e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) - } - - expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { - t.Helper() - - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - default: - t.Fatal("expected addr auto gen event") - } - } - - expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { - t.Helper() + for i, test := range tests { + t.Run(test.name, func(t *testing.T) { + seed := []byte{uint8(i)} + var tempIIDHistory [header.IIDSize]byte + header.InitialTempIID(tempIIDHistory[:], seed, nicID) + newTempAddr := func(stableAddr tcpip.Address) tcpip.AddressWithPrefix { + return header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableAddr) + } - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - case <-time.After(defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for addr auto gen event") - } - } + ndpDisp := ndpDispatcher{ + dadC: make(chan ndpDADEvent, 2), + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), + } + e := channel.New(0, 1280, linkAddr1) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + DADConfigs: stack.DADConfigurations{ + DupAddrDetectTransmits: test.dupAddrTransmits, + RetransmitTimer: test.retransmitTimer, + }, + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, + AutoGenGlobalAddresses: true, + AutoGenTempGlobalAddresses: true, + MaxTempAddrValidLifetime: 2 * ipv6.MinPrefixInformationValidLifetimeForUpdate, + MaxTempAddrPreferredLifetime: 2 * ipv6.MinPrefixInformationValidLifetimeForUpdate, + }, + NDPDisp: &ndpDisp, + TempIIDSeed: seed, + })}, + Clock: clock, + }) - expectDADEventAsync := func(addr tcpip.Address) { - t.Helper() + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } - select { - case e := <-ndpDisp.dadC: - if diff := checkDADEvent(e, nicID, addr, &stack.DADSucceeded{}); diff != "" { - t.Errorf("DAD event mismatch (-want +got):\n%s", diff) - } - case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for DAD event") - } - } + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() - // Receive an RA with prefix1 in an NDP Prefix Information option (PI) - // with zero valid lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0)) select { case e := <-ndpDisp.autoGenAddrC: - t.Fatalf("unexpectedly auto-generated an address with 0 lifetime; event = %+v", e) + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } default: + t.Fatal("expected addr auto gen event") } + } - // Receive an RA with prefix1 in an NDP Prefix Information option (PI) - // with non-zero valid lifetime. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) - expectAutoGenAddrEvent(addr1, newAddr) - expectDADEventAsync(addr1.Address) + expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + t.Helper() + + clock.RunImmediatelyScheduledJobs() select { case e := <-ndpDisp.autoGenAddrC: - t.Fatalf("unexpectedly got an auto gen addr event = %+v", e) + if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } default: + t.Fatal("timed out waiting for addr auto gen event") } - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1}, nil); mismatch != "" { - t.Fatal(mismatch) - } + } - // Receive an RA with prefix1 in an NDP Prefix Information option (PI) - // with non-zero valid & preferred lifetimes. - tempAddr1 := newTempAddr(addr1.Address) - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) - expectAutoGenAddrEvent(tempAddr1, newAddr) - expectDADEventAsync(tempAddr1.Address) - if mismatch := addressCheck(s.NICInfo()[1].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" { - t.Fatal(mismatch) - } + expectDADEventAsync := func(addr tcpip.Address) { + t.Helper() - // Receive an RA with prefix2 in an NDP Prefix Information option (PI) - // with preferred lifetime > valid lifetime - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6)) + clock.Advance(time.Duration(test.dupAddrTransmits) * test.retransmitTimer) select { - case e := <-ndpDisp.autoGenAddrC: - t.Fatalf("unexpectedly auto-generated an address with preferred lifetime > valid lifetime; event = %+v", e) + case e := <-ndpDisp.dadC: + if diff := checkDADEvent(e, nicID, addr, &stack.DADSucceeded{}); diff != "" { + t.Errorf("DAD event mismatch (-want +got):\n%s", diff) + } default: + t.Fatal("timed out waiting for DAD event") } - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" { - t.Fatal(mismatch) - } + } - // Receive an RA with prefix2 in a PI w/ non-zero valid and preferred - // lifetimes. - tempAddr2 := newTempAddr(addr2.Address) - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) - expectAutoGenAddrEvent(addr2, newAddr) - expectDADEventAsync(addr2.Address) - expectAutoGenAddrEventAsync(tempAddr2, newAddr) - expectDADEventAsync(tempAddr2.Address) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { - t.Fatal(mismatch) - } + // Receive an RA with prefix1 in an NDP Prefix Information option (PI) + // with zero valid lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0)) + select { + case e := <-ndpDisp.autoGenAddrC: + t.Fatalf("unexpectedly auto-generated an address with 0 lifetime; event = %+v", e) + default: + } - // Deprecate prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) - expectAutoGenAddrEvent(addr1, deprecatedAddr) - expectAutoGenAddrEvent(tempAddr1, deprecatedAddr) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { - t.Fatal(mismatch) - } + // Receive an RA with prefix1 in an NDP Prefix Information option (PI) + // with non-zero valid lifetime. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) + expectAutoGenAddrEvent(addr1, newAddr) + expectDADEventAsync(addr1.Address) + select { + case e := <-ndpDisp.autoGenAddrC: + t.Fatalf("unexpectedly got an auto gen addr event = %+v", e) + default: + } + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1}, nil); mismatch != "" { + t.Fatal(mismatch) + } - // Refresh lifetimes for prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { - t.Fatal(mismatch) - } + // Receive an RA with prefix1 in an NDP Prefix Information option (PI) + // with non-zero valid & preferred lifetimes. + tempAddr1 := newTempAddr(addr1.Address) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) + expectAutoGenAddrEvent(tempAddr1, newAddr) + expectDADEventAsync(tempAddr1.Address) + if mismatch := addressCheck(s.NICInfo()[1].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" { + t.Fatal(mismatch) + } - // Reduce valid lifetime and deprecate addresses of prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0)) - expectAutoGenAddrEvent(addr1, deprecatedAddr) - expectAutoGenAddrEvent(tempAddr1, deprecatedAddr) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { - t.Fatal(mismatch) - } + // Receive an RA with prefix2 in an NDP Prefix Information option (PI) + // with preferred lifetime > valid lifetime + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6)) + select { + case e := <-ndpDisp.autoGenAddrC: + t.Fatalf("unexpectedly auto-generated an address with preferred lifetime > valid lifetime; event = %+v", e) + default: + } + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" { + t.Fatal(mismatch) + } - // Wait for addrs of prefix1 to be invalidated. They should be - // invalidated at the same time. - select { - case e := <-ndpDisp.autoGenAddrC: - var nextAddr tcpip.AddressWithPrefix - if e.addr == addr1 { - if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - nextAddr = tempAddr1 - } else { - if diff := checkAutoGenAddrEvent(e, tempAddr1, invalidatedAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - nextAddr = addr1 - } + // Receive an RA with prefix2 in a PI with a valid lifetime that exceeds + // the minimum and won't be reached in this test. + tempAddr2 := newTempAddr(addr2.Address) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 2*minVLSeconds, 2*minVLSeconds)) + expectAutoGenAddrEvent(addr2, newAddr) + expectDADEventAsync(addr2.Address) + expectAutoGenAddrEventAsync(tempAddr2, newAddr) + expectDADEventAsync(tempAddr2.Address) + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { + t.Fatal(mismatch) + } - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, nextAddr, invalidatedAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - case <-time.After(defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for addr auto gen event") + // Deprecate prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0)) + expectAutoGenAddrEvent(addr1, deprecatedAddr) + expectAutoGenAddrEvent(tempAddr1, deprecatedAddr) + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { + t.Fatal(mismatch) + } + + // Refresh lifetimes for prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100)) + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { + t.Fatal(mismatch) + } + + // Reduce valid lifetime and deprecate addresses of prefix1. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, minVLSeconds, 0)) + expectAutoGenAddrEvent(addr1, deprecatedAddr) + expectAutoGenAddrEvent(tempAddr1, deprecatedAddr) + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" { + t.Fatal(mismatch) + } + + // Wait for addrs of prefix1 to be invalidated. They should be + // invalidated at the same time. + clock.Advance(ipv6.MinPrefixInformationValidLifetimeForUpdate) + select { + case e := <-ndpDisp.autoGenAddrC: + var nextAddr tcpip.AddressWithPrefix + if e.addr == addr1 { + if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for addr auto gen event") - } - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" { - t.Fatal(mismatch) + nextAddr = tempAddr1 + } else { + if diff := checkAutoGenAddrEvent(e, tempAddr1, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } + nextAddr = addr1 } - // Receive an RA with prefix2 in a PI w/ 0 lifetimes. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 0, 0)) - expectAutoGenAddrEvent(addr2, deprecatedAddr) - expectAutoGenAddrEvent(tempAddr2, deprecatedAddr) select { case e := <-ndpDisp.autoGenAddrC: - t.Errorf("got unexpected auto gen addr event = %+v", e) + if diff := checkAutoGenAddrEvent(e, nextAddr, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) + } default: + t.Fatal("timed out waiting for addr auto gen event") } - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" { - t.Fatal(mismatch) - } - }) - } - }) + default: + t.Fatal("timed out waiting for addr auto gen event") + } + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" { + t.Fatal(mismatch) + } + + // Receive an RA with prefix2 in a PI w/ 0 lifetimes. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 0, 0)) + expectAutoGenAddrEvent(addr2, deprecatedAddr) + expectAutoGenAddrEvent(tempAddr2, deprecatedAddr) + select { + case e := <-ndpDisp.autoGenAddrC: + t.Errorf("got unexpected auto gen addr event = %+v", e) + default: + } + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" { + t.Fatal(mismatch) + } + }) + } } // TestNoAutoGenTempAddrForLinkLocal test that temporary SLAAC addresses are not @@ -2266,12 +2247,6 @@ func TestAutoGenTempAddr(t *testing.T) { func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) { const nicID = 1 - savedMaxDesyncFactor := ipv6.MaxDesyncFactor - defer func() { - ipv6.MaxDesyncFactor = savedMaxDesyncFactor - }() - ipv6.MaxDesyncFactor = time.Nanosecond - tests := []struct { name string dupAddrTransmits uint8 @@ -2287,66 +2262,56 @@ func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) { }, } - // This Run will not return until the parallel tests finish. - // - // We need this because we need to do some teardown work after the - // parallel tests complete. - // - // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for - // more details. - t.Run("group", func(t *testing.T) { - for _, test := range tests { - test := test - - t.Run(test.name, func(t *testing.T) { - t.Parallel() - - ndpDisp := ndpDispatcher{ - dadC: make(chan ndpDADEvent, 1), - autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), - } - e := channel.New(0, 1280, linkAddr1) - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - AutoGenTempGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, - AutoGenLinkLocal: true, - })}, - }) + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ndpDisp := ndpDispatcher{ + dadC: make(chan ndpDADEvent, 1), + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + AutoGenTempGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + AutoGenLinkLocal: true, + })}, + Clock: clock, + }) - if err := s.CreateNIC(nicID, e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) - } + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } - // The stable link-local address should auto-generate and resolve DAD. - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, tcpip.AddressWithPrefix{Address: llAddr1, PrefixLen: header.IIDOffsetInIPv6Address * 8}, newAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - default: - t.Fatal("expected addr auto gen event") + // The stable link-local address should auto-generate and resolve DAD. + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, tcpip.AddressWithPrefix{Address: llAddr1, PrefixLen: header.IIDOffsetInIPv6Address * 8}, newAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - select { - case e := <-ndpDisp.dadC: - if diff := checkDADEvent(e, nicID, llAddr1, &stack.DADSucceeded{}); diff != "" { - t.Errorf("DAD event mismatch (-want +got):\n%s", diff) - } - case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for DAD event") + default: + t.Fatal("expected addr auto gen event") + } + clock.Advance(time.Duration(test.dupAddrTransmits) * test.retransmitTimer) + select { + case e := <-ndpDisp.dadC: + if diff := checkDADEvent(e, nicID, llAddr1, &stack.DADSucceeded{}); diff != "" { + t.Errorf("DAD event mismatch (-want +got):\n%s", diff) } + default: + t.Fatal("timed out waiting for DAD event") + } - // No new addresses should be generated. - select { - case e := <-ndpDisp.autoGenAddrC: - t.Errorf("got unxpected auto gen addr event = %+v", e) - case <-time.After(defaultAsyncNegativeEventTimeout): - } - }) - } - }) + // No new addresses should be generated. + select { + case e := <-ndpDisp.autoGenAddrC: + t.Errorf("got unxpected auto gen addr event = %+v", e) + default: + } + }) + } } // TestNoAutoGenTempAddrWithoutStableAddr tests that a temporary SLAAC address @@ -2359,12 +2324,6 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { retransmitTimer = 2 * time.Second ) - savedMaxDesyncFactor := ipv6.MaxDesyncFactor - defer func() { - ipv6.MaxDesyncFactor = savedMaxDesyncFactor - }() - ipv6.MaxDesyncFactor = 0 - prefix, _, addr := prefixSubnetAddr(0, linkAddr1) var tempIIDHistory [header.IIDSize]byte header.InitialTempIID(tempIIDHistory[:], nil, nicID) @@ -2375,6 +2334,7 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } e := channel.New(0, 1280, linkAddr1) + clock := faketime.NewManualClock() s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ DADConfigs: stack.DADConfigurations{ @@ -2388,6 +2348,7 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { }, NDPDisp: &ndpDisp, })}, + Clock: clock, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -2417,12 +2378,13 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { // Wait for DAD to complete for the stable address then expect the temporary // address to be generated. + clock.Advance(dadTransmits * retransmitTimer) select { case e := <-ndpDisp.dadC: if diff := checkDADEvent(e, nicID, addr.Address, &stack.DADSucceeded{}); diff != "" { t.Errorf("DAD event mismatch (-want +got):\n%s", diff) } - case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): + default: t.Fatal("timed out waiting for DAD event") } select { @@ -2430,7 +2392,7 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { if diff := checkAutoGenAddrEvent(e, tempAddr, newAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncPositiveEventTimeout): + default: t.Fatal("timed out waiting for addr auto gen event") } } @@ -2439,46 +2401,44 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { // regenerated. func TestAutoGenTempAddrRegen(t *testing.T) { const ( - nicID = 1 - regenAfter = 2 * time.Second - newMinVL = 10 - newMinVLDuration = newMinVL * time.Second - ) + nicID = 1 + regenAdv = 2 * time.Second - savedMaxDesyncFactor := ipv6.MaxDesyncFactor - savedMinMaxTempAddrPreferredLifetime := ipv6.MinMaxTempAddrPreferredLifetime - savedMinMaxTempAddrValidLifetime := ipv6.MinMaxTempAddrValidLifetime - defer func() { - ipv6.MaxDesyncFactor = savedMaxDesyncFactor - ipv6.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime - ipv6.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime - }() - ipv6.MaxDesyncFactor = 0 - ipv6.MinMaxTempAddrPreferredLifetime = newMinVLDuration - ipv6.MinMaxTempAddrValidLifetime = newMinVLDuration + numTempAddrs = 3 + maxTempAddrValidLifetime = numTempAddrs * ipv6.MinPrefixInformationValidLifetimeForUpdate + ) prefix, _, addr := prefixSubnetAddr(0, linkAddr1) var tempIIDHistory [header.IIDSize]byte header.InitialTempIID(tempIIDHistory[:], nil, nicID) - tempAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) - tempAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) - tempAddr3 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) + var tempAddrs [numTempAddrs]tcpip.AddressWithPrefix + for i := 0; i < len(tempAddrs); i++ { + tempAddrs[i] = header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) + } ndpDisp := ndpDispatcher{ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), } e := channel.New(0, 1280, linkAddr1) ndpConfigs := ipv6.NDPConfigurations{ - HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, - AutoGenGlobalAddresses: true, - AutoGenTempGlobalAddresses: true, - RegenAdvanceDuration: newMinVLDuration - regenAfter, + HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, + AutoGenGlobalAddresses: true, + AutoGenTempGlobalAddresses: true, + RegenAdvanceDuration: regenAdv, + MaxTempAddrValidLifetime: maxTempAddrValidLifetime, + MaxTempAddrPreferredLifetime: ipv6.MinPrefixInformationValidLifetimeForUpdate, + } + clock := faketime.NewManualClock() + randSource := savingRandSource{ + s: rand.NewSource(time.Now().UnixNano()), } s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ NDPConfigs: ndpConfigs, NDPDisp: &ndpDisp, })}, + Clock: clock, + RandSource: &randSource, }) if err := s.CreateNIC(nicID, e); err != nil { @@ -2501,36 +2461,43 @@ func TestAutoGenTempAddrRegen(t *testing.T) { expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) { t.Helper() + clock.Advance(timeout) select { case e := <-ndpDisp.autoGenAddrC: if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(timeout): + default: t.Fatal("timed out waiting for addr auto gen event") } } + tempDesyncFactor := time.Duration(randSource.lastInt63) % ipv6.MaxDesyncFactor + effectiveMaxTempAddrPL := ipv6.MinPrefixInformationValidLifetimeForUpdate - tempDesyncFactor + // The time since the last regeneration before a new temporary address is + // generated. + tempAddrRegenenerationTime := effectiveMaxTempAddrPL - regenAdv + // Receive an RA with prefix1 in an NDP Prefix Information option (PI) // with non-zero valid & preferred lifetimes. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, minVLSeconds)) expectAutoGenAddrEvent(addr, newAddr) - expectAutoGenAddrEvent(tempAddr1, newAddr) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1}, nil); mismatch != "" { + expectAutoGenAddrEvent(tempAddrs[0], newAddr) + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddrs[0]}, nil); mismatch != "" { t.Fatal(mismatch) } // Wait for regeneration - expectAutoGenAddrEventAsync(tempAddr2, newAddr, regenAfter+defaultAsyncPositiveEventTimeout) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2}, nil); mismatch != "" { + expectAutoGenAddrEventAsync(tempAddrs[1], newAddr, tempAddrRegenenerationTime) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, minVLSeconds)) + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddrs[0], tempAddrs[1]}, nil); mismatch != "" { t.Fatal(mismatch) } + expectAutoGenAddrEventAsync(tempAddrs[0], deprecatedAddr, regenAdv) // Wait for regeneration - expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2, tempAddr3}, nil); mismatch != "" { - t.Fatal(mismatch) - } + expectAutoGenAddrEventAsync(tempAddrs[2], newAddr, tempAddrRegenenerationTime-regenAdv) + expectAutoGenAddrEventAsync(tempAddrs[1], deprecatedAddr, regenAdv) // Stop generating temporary addresses ndpConfigs.AutoGenTempGlobalAddresses = false @@ -2541,45 +2508,24 @@ func TestAutoGenTempAddrRegen(t *testing.T) { ndpEP.SetNDPConfigurations(ndpConfigs) } + // Refresh lifetimes and wait for the last temporary address to be deprecated. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, minVLSeconds)) + expectAutoGenAddrEventAsync(tempAddrs[2], deprecatedAddr, effectiveMaxTempAddrPL-regenAdv) + + // Refresh lifetimes such that the prefix is valid and preferred forever. + // + // This should not affect the lifetimes of temporary addresses because they + // are capped by the maximum valid and preferred lifetimes for temporary + // addresses. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, infiniteLifetimeSeconds, infiniteLifetimeSeconds)) + // Wait for all the temporary addresses to get invalidated. - tempAddrs := []tcpip.AddressWithPrefix{tempAddr1, tempAddr2, tempAddr3} - invalidateAfter := newMinVLDuration - 2*regenAfter + invalidateAfter := maxTempAddrValidLifetime - clock.NowMonotonic().Sub(tcpip.MonotonicTime{}) for _, addr := range tempAddrs { - // Wait for a deprecation then invalidation event, or just an invalidation - // event. We need to cover both cases but cannot deterministically hit both - // cases because the deprecation and invalidation jobs could execute in any - // order. - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, deprecatedAddr); diff == "" { - // If we get a deprecation event first, we should get an invalidation - // event almost immediately after. - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - case <-time.After(defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for addr auto gen event") - } - } else if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff == "" { - // If we get an invalidation event first, we shouldn't get a deprecation - // event after. - select { - case e := <-ndpDisp.autoGenAddrC: - t.Fatalf("unexpectedly got an auto-generated event = %+v", e) - case <-time.After(defaultAsyncNegativeEventTimeout): - } - } else { - t.Fatalf("got unexpected auto-generated event = %+v", e) - } - case <-time.After(invalidateAfter + defaultAsyncPositiveEventTimeout): - t.Fatal("timed out waiting for addr auto gen event") - } - - invalidateAfter = regenAfter + expectAutoGenAddrEventAsync(addr, invalidatedAddr, invalidateAfter) + invalidateAfter = tempAddrRegenenerationTime } - if mismatch := addressCheck(s.NICInfo()[1].ProtocolAddresses, []tcpip.AddressWithPrefix{addr}, tempAddrs); mismatch != "" { + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr}, tempAddrs[:]); mismatch != "" { t.Fatal(mismatch) } } @@ -2588,52 +2534,54 @@ func TestAutoGenTempAddrRegen(t *testing.T) { // regeneration job gets updated when refreshing the address's lifetimes. func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { const ( - nicID = 1 - regenAfter = 2 * time.Second - newMinVL = 10 - newMinVLDuration = newMinVL * time.Second - ) + nicID = 1 + regenAdv = 2 * time.Second - savedMaxDesyncFactor := ipv6.MaxDesyncFactor - savedMinMaxTempAddrPreferredLifetime := ipv6.MinMaxTempAddrPreferredLifetime - savedMinMaxTempAddrValidLifetime := ipv6.MinMaxTempAddrValidLifetime - defer func() { - ipv6.MaxDesyncFactor = savedMaxDesyncFactor - ipv6.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime - ipv6.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime - }() - ipv6.MaxDesyncFactor = 0 - ipv6.MinMaxTempAddrPreferredLifetime = newMinVLDuration - ipv6.MinMaxTempAddrValidLifetime = newMinVLDuration + numTempAddrs = 3 + maxTempAddrPreferredLifetime = ipv6.MinPrefixInformationValidLifetimeForUpdate + maxTempAddrPreferredLifetimeSeconds = uint32(maxTempAddrPreferredLifetime / time.Second) + ) prefix, _, addr := prefixSubnetAddr(0, linkAddr1) var tempIIDHistory [header.IIDSize]byte header.InitialTempIID(tempIIDHistory[:], nil, nicID) - tempAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) - tempAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) - tempAddr3 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) + var tempAddrs [numTempAddrs]tcpip.AddressWithPrefix + for i := 0; i < len(tempAddrs); i++ { + tempAddrs[i] = header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address) + } ndpDisp := ndpDispatcher{ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), } e := channel.New(0, 1280, linkAddr1) ndpConfigs := ipv6.NDPConfigurations{ - HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, - AutoGenGlobalAddresses: true, - AutoGenTempGlobalAddresses: true, - RegenAdvanceDuration: newMinVLDuration - regenAfter, + HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, + AutoGenGlobalAddresses: true, + AutoGenTempGlobalAddresses: true, + RegenAdvanceDuration: regenAdv, + MaxTempAddrPreferredLifetime: maxTempAddrPreferredLifetime, + MaxTempAddrValidLifetime: maxTempAddrPreferredLifetime * 2, + } + clock := faketime.NewManualClock() + initialTime := clock.NowMonotonic() + randSource := savingRandSource{ + s: rand.NewSource(time.Now().UnixNano()), } s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ NDPConfigs: ndpConfigs, NDPDisp: &ndpDisp, })}, + Clock: clock, + RandSource: &randSource, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } + tempDesyncFactor := time.Duration(randSource.lastInt63) % ipv6.MaxDesyncFactor + expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { t.Helper() @@ -2650,22 +2598,23 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) { t.Helper() + clock.Advance(timeout) select { case e := <-ndpDisp.autoGenAddrC: if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(timeout): + default: t.Fatal("timed out waiting for addr auto gen event") } } // Receive an RA with prefix1 in an NDP Prefix Information option (PI) // with non-zero valid & preferred lifetimes. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, maxTempAddrPreferredLifetimeSeconds, maxTempAddrPreferredLifetimeSeconds)) expectAutoGenAddrEvent(addr, newAddr) - expectAutoGenAddrEvent(tempAddr1, newAddr) - if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1}, nil); mismatch != "" { + expectAutoGenAddrEvent(tempAddrs[0], newAddr) + if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddrs[0]}, nil); mismatch != "" { t.Fatal(mismatch) } @@ -2673,13 +2622,27 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { // // A new temporary address should be generated after the regeneration // time has passed since the prefix is deprecated. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 0)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, maxTempAddrPreferredLifetimeSeconds, 0)) expectAutoGenAddrEvent(addr, deprecatedAddr) - expectAutoGenAddrEvent(tempAddr1, deprecatedAddr) + expectAutoGenAddrEvent(tempAddrs[0], deprecatedAddr) select { case e := <-ndpDisp.autoGenAddrC: - t.Fatalf("unexpected auto gen addr event = %+v", e) - case <-time.After(regenAfter + defaultAsyncNegativeEventTimeout): + t.Fatalf("unexpected auto gen addr event = %#v", e) + default: + } + + effectiveMaxTempAddrPL := maxTempAddrPreferredLifetime - tempDesyncFactor + // The time since the last regeneration before a new temporary address is + // generated. + tempAddrRegenenerationTime := effectiveMaxTempAddrPL - regenAdv + + // Advance the clock by the regeneration time but don't expect a new temporary + // address as the prefix is deprecated. + clock.Advance(tempAddrRegenenerationTime) + select { + case e := <-ndpDisp.autoGenAddrC: + t.Fatalf("unexpected auto gen addr event = %#v", e) + default: } // Prefer the prefix again. @@ -2687,8 +2650,15 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { // A new temporary address should immediately be generated since the // regeneration time has already passed since the last address was generated // - this regeneration does not depend on a job. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) - expectAutoGenAddrEvent(tempAddr2, newAddr) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, maxTempAddrPreferredLifetimeSeconds, maxTempAddrPreferredLifetimeSeconds)) + expectAutoGenAddrEvent(tempAddrs[1], newAddr) + // Wait for the first temporary address to be deprecated. + expectAutoGenAddrEventAsync(tempAddrs[0], deprecatedAddr, regenAdv) + select { + case e := <-ndpDisp.autoGenAddrC: + t.Fatalf("unexpected auto gen addr event = %s", e) + default: + } // Increase the maximum lifetimes for temporary addresses to large values // then refresh the lifetimes of the prefix. @@ -2699,34 +2669,30 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { // regenerate a new temporary address. Note, new addresses are only // regenerated after the preferred lifetime - the regenerate advance duration // as paased. - ndpConfigs.MaxTempAddrValidLifetime = 100 * time.Second - ndpConfigs.MaxTempAddrPreferredLifetime = 100 * time.Second + const largeLifetimeSeconds = minVLSeconds * 2 + const largeLifetime = time.Duration(largeLifetimeSeconds) * time.Second + ndpConfigs.MaxTempAddrValidLifetime = 2 * largeLifetime + ndpConfigs.MaxTempAddrPreferredLifetime = largeLifetime ipv6Ep, err := s.GetNetworkEndpoint(nicID, header.IPv6ProtocolNumber) if err != nil { t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err) } ndpEP := ipv6Ep.(ipv6.NDPEndpoint) ndpEP.SetNDPConfigurations(ndpConfigs) - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, largeLifetimeSeconds, largeLifetimeSeconds)) + timeSinceInitialTime := clock.NowMonotonic().Sub(initialTime) + clock.Advance(largeLifetime - timeSinceInitialTime) + expectAutoGenAddrEvent(tempAddrs[0], deprecatedAddr) + // to offset the advement of time to test the first temporary address's + // deprecation after the second was generated + advLess := regenAdv + expectAutoGenAddrEventAsync(tempAddrs[2], newAddr, timeSinceInitialTime-advLess-(tempDesyncFactor+regenAdv)) + expectAutoGenAddrEventAsync(tempAddrs[1], deprecatedAddr, regenAdv) select { case e := <-ndpDisp.autoGenAddrC: t.Fatalf("unexpected auto gen addr event = %+v", e) - case <-time.After(regenAfter + defaultAsyncNegativeEventTimeout): + default: } - - // Set the maximum lifetimes for temporary addresses such that on the next - // RA, the regeneration job gets scheduled again. - // - // The maximum lifetime is the sum of the minimum lifetimes for temporary - // addresses + the time that has already passed since the last address was - // generated so that the regeneration job is needed to generate the next - // address. - newLifetimes := newMinVLDuration + regenAfter + defaultAsyncNegativeEventTimeout - ndpConfigs.MaxTempAddrValidLifetime = newLifetimes - ndpConfigs.MaxTempAddrPreferredLifetime = newLifetimes - ndpEP.SetNDPConfigurations(ndpConfigs) - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) - expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout) } // TestMixedSLAACAddrConflictRegen tests SLAAC address regeneration in response @@ -2853,8 +2819,12 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { continue } - if err := s.AddAddress(nicID, ipv6.ProtocolNumber, test.addrs[j].Address); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, test.addrs[j].Address, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: test.addrs[j].Address.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } manuallyAssignedAddresses[test.addrs[j].Address] = struct{}{} @@ -2954,13 +2924,14 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { // stack.Stack will have a default route through the router (llAddr3) installed // and a static link-address (linkAddr3) added to the link address cache for the // router. -func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) { +func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack, *faketime.ManualClock) { t.Helper() ndpDisp := &ndpDispatcher{ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } e := channel.New(0, 1280, linkAddr1) e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + clock := faketime.NewManualClock() s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ NDPConfigs: ipv6.NDPConfigurations{ @@ -2970,6 +2941,7 @@ func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*nd NDPDisp: ndpDisp, })}, TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + Clock: clock, }) if err := s.CreateNIC(nicID, e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) @@ -2983,7 +2955,7 @@ func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*nd if err := s.AddStaticNeighbor(nicID, ipv6.ProtocolNumber, llAddr3, linkAddr3); err != nil { t.Fatalf("s.AddStaticNeighbor(%d, %d, %s, %s): %s", nicID, ipv6.ProtocolNumber, llAddr3, linkAddr3, err) } - return ndpDisp, e, s + return ndpDisp, e, s, clock } // addrForNewConnectionTo returns the local address used when creating a new @@ -3057,7 +3029,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) - ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) + ndpDisp, e, s, _ := stackAndNdpDispatcherWithDefaultRoute(t, nicID) expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { t.Helper() @@ -3160,19 +3132,11 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { // when its preferred lifetime expires. func TestAutoGenAddrJobDeprecation(t *testing.T) { const nicID = 1 - const newMinVL = 2 - newMinVLDuration := newMinVL * time.Second - - saved := ipv6.MinPrefixInformationValidLifetimeForUpdate - defer func() { - ipv6.MinPrefixInformationValidLifetimeForUpdate = saved - }() - ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) - ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) + ndpDisp, e, s, clock := stackAndNdpDispatcherWithDefaultRoute(t, nicID) expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { t.Helper() @@ -3190,12 +3154,13 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) { t.Helper() + clock.Advance(timeout) select { case e := <-ndpDisp.autoGenAddrC: if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(timeout): + default: t.Fatal("timed out waiting for addr auto gen event") } } @@ -3213,7 +3178,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { } // Receive PI for prefix2. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, infiniteLifetimeSeconds, infiniteLifetimeSeconds)) expectAutoGenAddrEvent(addr2, newAddr) if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) { t.Fatalf("should have %s in the list of addresses", addr2) @@ -3232,7 +3197,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { expectPrimaryAddr(addr1) // Refresh lifetime for addr of prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, minVLSeconds, minVLSeconds-1)) select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly got an auto-generated event") @@ -3241,7 +3206,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { expectPrimaryAddr(addr1) // Wait for addr of prefix1 to be deprecated. - expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, ipv6.MinPrefixInformationValidLifetimeForUpdate-time.Second) if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } @@ -3251,6 +3216,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { // addr2 should be the primary endpoint now since addr1 is deprecated but // addr2 is not. expectPrimaryAddr(addr2) + // addr1 is deprecated but if explicitly requested, it should be used. fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID} if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { @@ -3259,7 +3225,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { // Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make // sure we do not get a deprecation event again. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, minVLSeconds, 0)) select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly got an auto-generated event") @@ -3271,7 +3237,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { } // Refresh lifetimes for addr of prefix1. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, minVLSeconds, minVLSeconds-1)) select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly got an auto-generated event") @@ -3281,7 +3247,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { expectPrimaryAddr(addr1) // Wait for addr of prefix1 to be deprecated. - expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, ipv6.MinPrefixInformationValidLifetimeForUpdate-time.Second) if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } @@ -3295,7 +3261,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { } // Wait for addr of prefix1 to be invalidated. - expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout) + expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second) if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } @@ -3305,7 +3271,7 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { expectPrimaryAddr(addr2) // Refresh both lifetimes for addr of prefix2 to the same value. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL)) + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, minVLSeconds, minVLSeconds)) select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly got an auto-generated event") @@ -3317,6 +3283,17 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { // cases because the deprecation and invalidation handlers could be handled in // either deprecation then invalidation, or invalidation then deprecation // (which should be cancelled by the invalidation handler). + // + // Since we're about to cause both events to fire, we need the dispatcher + // channel to be able to hold both. + if got, want := len(ndpDisp.autoGenAddrC), 0; got != want { + t.Fatalf("got len(ndpDisp.autoGenAddrC) = %d, want %d", got, want) + } + if got, want := cap(ndpDisp.autoGenAddrC), 1; got != want { + t.Fatalf("got cap(ndpDisp.autoGenAddrC) = %d, want %d", got, want) + } + ndpDisp.autoGenAddrC = make(chan ndpAutoGenAddrEvent, 2) + clock.Advance(ipv6.MinPrefixInformationValidLifetimeForUpdate) select { case e := <-ndpDisp.autoGenAddrC: if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" { @@ -3327,21 +3304,21 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncPositiveEventTimeout): + default: t.Fatal("timed out waiting for addr auto gen event") } } else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" { - // If we get an invalidation event first, we should not get a deprecation + // If we get an invalidation event first, we should not get a deprecation // event after. select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly got an auto-generated event") - case <-time.After(defaultAsyncNegativeEventTimeout): + default: } } else { t.Fatalf("got unexpected auto-generated event") } - case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): + default: t.Fatal("timed out waiting for addr auto gen event") } if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { @@ -3378,15 +3355,6 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) { // infinite values. func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { const infiniteVLSeconds = 2 - const minVLSeconds = 1 - savedIL := header.NDPInfiniteLifetime - savedMinVL := ipv6.MinPrefixInformationValidLifetimeForUpdate - defer func() { - ipv6.MinPrefixInformationValidLifetimeForUpdate = savedMinVL - header.NDPInfiniteLifetime = savedIL - }() - ipv6.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second - header.NDPInfiniteLifetime = infiniteVLSeconds * time.Second prefix, _, addr := prefixSubnetAddr(0, linkAddr1) @@ -3410,68 +3378,58 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { }, } - // This Run will not return until the parallel tests finish. - // - // We need this because we need to do some teardown work after the - // parallel tests complete. - // - // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for - // more details. - t.Run("group", func(t *testing.T) { - for _, test := range tests { - test := test - - t.Run(test.name, func(t *testing.T) { - t.Parallel() - - ndpDisp := ndpDispatcher{ - autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), - } - e := channel.New(0, 1280, linkAddr1) - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, - })}, - }) - - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) - } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + } + e := channel.New(0, 1280, linkAddr1) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, + Clock: clock, + }) - // Receive an RA with finite prefix. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0)) - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } - default: - t.Fatal("expected addr auto gen event") + // Receive an RA with finite prefix. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0)) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - // Receive an new RA with prefix with infinite VL. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.infiniteVL, 0)) + default: + t.Fatal("expected addr auto gen event") + } - // Receive a new RA with prefix with finite VL. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0)) + // Receive an new RA with prefix with infinite VL. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.infiniteVL, 0)) - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } + // Receive a new RA with prefix with finite VL. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0)) - case <-time.After(minVLSeconds*time.Second + defaultAsyncPositiveEventTimeout): - t.Fatal("timeout waiting for addr auto gen event") + clock.Advance(ipv6.MinPrefixInformationValidLifetimeForUpdate) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - }) - } - }) + + default: + t.Fatal("timeout waiting for addr auto gen event") + } + }) + } } // TestAutoGenAddrValidLifetimeUpdates tests that the valid lifetime of an @@ -3479,12 +3437,6 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { // RFC 4862 section 5.5.3.e. func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) { const infiniteVL = 4294967295 - const newMinVL = 4 - saved := ipv6.MinPrefixInformationValidLifetimeForUpdate - defer func() { - ipv6.MinPrefixInformationValidLifetimeForUpdate = saved - }() - ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second prefix, _, addr := prefixSubnetAddr(0, linkAddr1) @@ -3495,137 +3447,129 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) { evl uint32 }{ // Should update the VL to the minimum VL for updating if the - // new VL is less than newMinVL but was originally greater than + // new VL is less than minVLSeconds but was originally greater than // it. { "LargeVLToVLLessThanMinVLForUpdate", 9999, 1, - newMinVL, + minVLSeconds, }, { "LargeVLTo0", 9999, 0, - newMinVL, + minVLSeconds, }, { "InfiniteVLToVLLessThanMinVLForUpdate", infiniteVL, 1, - newMinVL, + minVLSeconds, }, { "InfiniteVLTo0", infiniteVL, 0, - newMinVL, + minVLSeconds, }, - // Should not update VL if original VL was less than newMinVL - // and the new VL is also less than newMinVL. + // Should not update VL if original VL was less than minVLSeconds + // and the new VL is also less than minVLSeconds. { "ShouldNotUpdateWhenBothOldAndNewAreLessThanMinVLForUpdate", - newMinVL - 1, - newMinVL - 3, - newMinVL - 1, + minVLSeconds - 1, + minVLSeconds - 3, + minVLSeconds - 1, }, // Should take the new VL if the new VL is greater than the - // remaining time or is greater than newMinVL. + // remaining time or is greater than minVLSeconds. { "MorethanMinVLToLesserButStillMoreThanMinVLForUpdate", - newMinVL + 5, - newMinVL + 3, - newMinVL + 3, + minVLSeconds + 5, + minVLSeconds + 3, + minVLSeconds + 3, }, { "SmallVLToGreaterVLButStillLessThanMinVLForUpdate", - newMinVL - 3, - newMinVL - 1, - newMinVL - 1, + minVLSeconds - 3, + minVLSeconds - 1, + minVLSeconds - 1, }, { "SmallVLToGreaterVLThatIsMoreThaMinVLForUpdate", - newMinVL - 3, - newMinVL + 1, - newMinVL + 1, + minVLSeconds - 3, + minVLSeconds + 1, + minVLSeconds + 1, }, } - // This Run will not return until the parallel tests finish. - // - // We need this because we need to do some teardown work after the - // parallel tests complete. - // - // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for - // more details. - t.Run("group", func(t *testing.T) { - for _, test := range tests { - test := test - - t.Run(test.name, func(t *testing.T) { - t.Parallel() - - ndpDisp := ndpDispatcher{ - autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10), - } - e := channel.New(10, 1280, linkAddr1) - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ - NDPConfigs: ipv6.NDPConfigurations{ - HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, - AutoGenGlobalAddresses: true, - }, - NDPDisp: &ndpDisp, - })}, - }) + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ndpDisp := ndpDispatcher{ + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10), + } + e := channel.New(10, 1280, linkAddr1) + clock := faketime.NewManualClock() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ + NDPConfigs: ipv6.NDPConfigurations{ + HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, + AutoGenGlobalAddresses: true, + }, + NDPDisp: &ndpDisp, + })}, + Clock: clock, + }) - if err := s.CreateNIC(1, e); err != nil { - t.Fatalf("CreateNIC(1) = %s", err) - } + if err := s.CreateNIC(1, e); err != nil { + t.Fatalf("CreateNIC(1) = %s", err) + } - // Receive an RA with prefix with initial VL, - // test.ovl. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0)) - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - default: - t.Fatal("expected addr auto gen event") + // Receive an RA with prefix with initial VL, + // test.ovl. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0)) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } + default: + t.Fatal("expected addr auto gen event") + } - // Receive an new RA with prefix with new VL, - // test.nvl. - e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0)) + // Receive an new RA with prefix with new VL, + // test.nvl. + e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0)) - // - // Validate that the VL for the address got set - // to test.evl. - // + // + // Validate that the VL for the address got set + // to test.evl. + // - // The address should not be invalidated until the effective valid - // lifetime has passed. - select { - case <-ndpDisp.autoGenAddrC: - t.Fatal("unexpectedly received an auto gen addr event") - case <-time.After(time.Duration(test.evl)*time.Second - defaultAsyncNegativeEventTimeout): - } + // The address should not be invalidated until the effective valid + // lifetime has passed. + const delta = 1 + clock.Advance(time.Duration(test.evl)*time.Second - delta) + select { + case <-ndpDisp.autoGenAddrC: + t.Fatal("unexpectedly received an auto gen addr event") + default: + } - // Wait for the invalidation event. - select { - case e := <-ndpDisp.autoGenAddrC: - if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { - t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) - } - case <-time.After(defaultAsyncPositiveEventTimeout): - t.Fatal("timeout waiting for addr auto gen event") + // Wait for the invalidation event. + clock.Advance(delta) + select { + case e := <-ndpDisp.autoGenAddrC: + if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { + t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - }) - } - }) + default: + t.Fatal("timeout waiting for addr auto gen event") + } + }) + } } // TestAutoGenAddrRemoval tests that when auto-generated addresses are removed @@ -3696,7 +3640,7 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) { prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1) prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1) - ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID) + ndpDisp, e, s, _ := stackAndNdpDispatcherWithDefaultRoute(t, nicID) expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { t.Helper() @@ -3735,8 +3679,9 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) { Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr2, } - if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil { - t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err) + properties := stack.AddressProperties{PEB: stack.FirstPrimaryEndpoint} + if err := s.AddProtocolAddress(nicID, protoAddr2, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v) = %s", nicID, protoAddr2, properties, err) } // addr2 should be more preferred now since it is at the front of the primary // list. @@ -3824,8 +3769,9 @@ func TestAutoGenAddrStaticConflict(t *testing.T) { } // Add the address as a static address before SLAAC tries to add it. - if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil { - t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err) + protocolAddr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr} + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(1, %+v, {}) = %s", protocolAddr, err) } if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) { t.Fatalf("Should have %s in the list of addresses", addr1) @@ -3976,13 +3922,6 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { const maxMaxRetries = 3 const lifetimeSeconds = 10 - // Needed for the temporary address sub test. - savedMaxDesync := ipv6.MaxDesyncFactor - defer func() { - ipv6.MaxDesyncFactor = savedMaxDesync - }() - ipv6.MaxDesyncFactor = time.Nanosecond - secretKey := makeSecretKey(t) prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1) @@ -4008,22 +3947,24 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { } } - expectAutoGenAddrEventAsync := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { + expectAutoGenAddrEventAsync := func(t *testing.T, clock *faketime.ManualClock, ndpDisp *ndpDispatcher, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) { t.Helper() + clock.RunImmediatelyScheduledJobs() select { case e := <-ndpDisp.autoGenAddrC: if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncPositiveEventTimeout): + default: t.Fatal("timed out waiting for addr auto gen event") } } - expectDADEvent := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.Address, res stack.DADResult) { + expectDADEvent := func(t *testing.T, clock *faketime.ManualClock, ndpDisp *ndpDispatcher, addr tcpip.Address, res stack.DADResult) { t.Helper() + clock.RunImmediatelyScheduledJobs() select { case e := <-ndpDisp.dadC: if diff := checkDADEvent(e, nicID, addr, res); diff != "" { @@ -4034,15 +3975,16 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { } } - expectDADEventAsync := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.Address, res stack.DADResult) { + expectDADEventAsync := func(t *testing.T, clock *faketime.ManualClock, ndpDisp *ndpDispatcher, addr tcpip.Address, res stack.DADResult) { t.Helper() + clock.Advance(dadTransmits * retransmitTimer) select { case e := <-ndpDisp.dadC: if diff := checkDADEvent(e, nicID, addr, res); diff != "" { t.Errorf("DAD event mismatch (-want +got):\n%s", diff) } - case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): + default: t.Fatal("timed out waiting for DAD event") } } @@ -4053,7 +3995,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { name string ndpConfigs ipv6.NDPConfigurations autoGenLinkLocal bool - prepareFn func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix + prepareFn func(t *testing.T, clock *faketime.ManualClock, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix addrGenFn func(dadCounter uint8, tempIIDHistory []byte) tcpip.AddressWithPrefix }{ { @@ -4062,7 +4004,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { HandleRAs: ipv6.HandlingRAsEnabledWhenForwardingDisabled, AutoGenGlobalAddresses: true, }, - prepareFn: func(_ *testing.T, _ *ndpDispatcher, e *channel.Endpoint, _ []byte) []tcpip.AddressWithPrefix { + prepareFn: func(_ *testing.T, _ *faketime.ManualClock, _ *ndpDispatcher, e *channel.Endpoint, _ []byte) []tcpip.AddressWithPrefix { // Receive an RA with prefix1 in a PI. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds)) return nil @@ -4076,7 +4018,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { name: "LinkLocal address", ndpConfigs: ipv6.NDPConfigurations{}, autoGenLinkLocal: true, - prepareFn: func(*testing.T, *ndpDispatcher, *channel.Endpoint, []byte) []tcpip.AddressWithPrefix { + prepareFn: func(*testing.T, *faketime.ManualClock, *ndpDispatcher, *channel.Endpoint, []byte) []tcpip.AddressWithPrefix { return nil }, addrGenFn: func(dadCounter uint8, _ []byte) tcpip.AddressWithPrefix { @@ -4090,14 +4032,14 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { AutoGenGlobalAddresses: true, AutoGenTempGlobalAddresses: true, }, - prepareFn: func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix { + prepareFn: func(t *testing.T, clock *faketime.ManualClock, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix { header.InitialTempIID(tempIIDHistory, nil, nicID) // Generate a stable SLAAC address so temporary addresses will be // generated. e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) expectAutoGenAddrEvent(t, ndpDisp, stableAddrForTempAddrTest, newAddr) - expectDADEventAsync(t, ndpDisp, stableAddrForTempAddrTest.Address, &stack.DADSucceeded{}) + expectDADEventAsync(t, clock, ndpDisp, stableAddrForTempAddrTest.Address, &stack.DADSucceeded{}) // The stable address will be assigned throughout the test. return []tcpip.AddressWithPrefix{stableAddrForTempAddrTest} @@ -4109,14 +4051,6 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { } for _, addrType := range addrTypes { - // This Run will not return until the parallel tests finish. - // - // We need this because we need to do some teardown work after the parallel - // tests complete and limit the number of parallel tests running at the same - // time to reduce flakes. - // - // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for - // more details. t.Run(addrType.name, func(t *testing.T) { for maxRetries := uint8(0); maxRetries <= maxMaxRetries; maxRetries++ { for numFailures := uint8(0); numFailures <= maxRetries+1; numFailures++ { @@ -4125,8 +4059,6 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { addrType := addrType t.Run(fmt.Sprintf("%d max retries and %d failures", maxRetries, numFailures), func(t *testing.T) { - t.Parallel() - ndpDisp := ndpDispatcher{ dadC: make(chan ndpDADEvent, 1), autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2), @@ -4134,6 +4066,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { e := channel.New(0, 1280, linkAddr1) ndpConfigs := addrType.ndpConfigs ndpConfigs.AutoGenAddressConflictRetries = maxRetries + clock := faketime.NewManualClock() s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ AutoGenLinkLocal: addrType.autoGenLinkLocal, @@ -4150,6 +4083,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { SecretKey: secretKey, }, })}, + Clock: clock, }) opts := stack.NICOptions{Name: nicName} if err := s.CreateNICWithOptions(nicID, e, opts); err != nil { @@ -4157,12 +4091,12 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { } var tempIIDHistory [header.IIDSize]byte - stableAddrs := addrType.prepareFn(t, &ndpDisp, e, tempIIDHistory[:]) + stableAddrs := addrType.prepareFn(t, clock, &ndpDisp, e, tempIIDHistory[:]) // Simulate DAD conflicts so the address is regenerated. for i := uint8(0); i < numFailures; i++ { addr := addrType.addrGenFn(i, tempIIDHistory[:]) - expectAutoGenAddrEventAsync(t, &ndpDisp, addr, newAddr) + expectAutoGenAddrEventAsync(t, clock, &ndpDisp, addr, newAddr) // Should not have any new addresses assigned to the NIC. if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, stableAddrs, nil); mismatch != "" { @@ -4172,17 +4106,21 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { // Simulate a DAD conflict. rxNDPSolicit(e, addr.Address) expectAutoGenAddrEvent(t, &ndpDisp, addr, invalidatedAddr) - expectDADEvent(t, &ndpDisp, addr.Address, &stack.DADDupAddrDetected{}) + expectDADEvent(t, clock, &ndpDisp, addr.Address, &stack.DADDupAddrDetected{}) // Attempting to add the address manually should not fail if the // address's state was cleaned up when DAD failed. - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr.Address); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr.Address, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addr, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } if err := s.RemoveAddress(nicID, addr.Address); err != nil { t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr.Address, err) } - expectDADEvent(t, &ndpDisp, addr.Address, &stack.DADAborted{}) + expectDADEvent(t, clock, &ndpDisp, addr.Address, &stack.DADAborted{}) } // Should not have any new addresses assigned to the NIC. @@ -4194,8 +4132,8 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { // an address after DAD resolves. if maxRetries+1 > numFailures { addr := addrType.addrGenFn(numFailures, tempIIDHistory[:]) - expectAutoGenAddrEventAsync(t, &ndpDisp, addr, newAddr) - expectDADEventAsync(t, &ndpDisp, addr.Address, &stack.DADSucceeded{}) + expectAutoGenAddrEventAsync(t, clock, &ndpDisp, addr, newAddr) + expectDADEventAsync(t, clock, &ndpDisp, addr.Address, &stack.DADSucceeded{}) if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, append(stableAddrs, addr), nil); mismatch != "" { t.Fatal(mismatch) } @@ -4205,7 +4143,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { select { case e := <-ndpDisp.autoGenAddrC: t.Fatalf("unexpectedly got an auto-generated address event = %+v", e) - case <-time.After(defaultAsyncNegativeEventTimeout): + default: } }) } @@ -4718,11 +4656,9 @@ func TestNoCleanupNDPStateWhenForwardingEnabled(t *testing.T) { ) ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, 1), - rememberRouter: true, - prefixC: make(chan ndpPrefixEvent, 1), - rememberPrefix: true, - autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), + offLinkRouteC: make(chan ndpOffLinkRouteEvent, 1), + prefixC: make(chan ndpPrefixEvent, 1), + autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1), } s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{ @@ -4765,17 +4701,17 @@ func TestNoCleanupNDPStateWhenForwardingEnabled(t *testing.T) { ), ) select { - case e := <-ndpDisp.routerC: - if diff := checkRouterEvent(e, llAddr3, true /* discovered */); diff != "" { - t.Errorf("router event mismatch (-want +got):\n%s", diff) + case e := <-ndpDisp.offLinkRouteC: + if diff := checkOffLinkRouteEvent(e, nicID, header.IPv6EmptySubnet, llAddr3, header.MediumRoutePreference, true /* discovered */); diff != "" { + t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) } default: - t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID) + t.Errorf("expected off-link route event for %s on NIC(%d)", llAddr3, nicID) } select { case e := <-ndpDisp.prefixC: if diff := checkPrefixEvent(e, subnet, true /* discovered */); diff != "" { - t.Errorf("router event mismatch (-want +got):\n%s", diff) + t.Errorf("off-link route event mismatch (-want +got):\n%s", diff) } default: t.Errorf("expected prefix event for %s on NIC(%d)", prefix, nicID) @@ -4797,8 +4733,8 @@ func TestNoCleanupNDPStateWhenForwardingEnabled(t *testing.T) { t.Fatalf("SetForwardingDefaultAndAllNICs(%d, %t): %s", ipv6.ProtocolNumber, forwarding, err) } select { - case e := <-ndpDisp.routerC: - t.Errorf("unexpected router event = %#v", e) + case e := <-ndpDisp.offLinkRouteC: + t.Errorf("unexpected off-link route event = %#v", e) default: } select { @@ -4884,11 +4820,9 @@ func TestCleanupNDPState(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { ndpDisp := ndpDispatcher{ - routerC: make(chan ndpRouterEvent, maxRouterAndPrefixEvents), - rememberRouter: true, - prefixC: make(chan ndpPrefixEvent, maxRouterAndPrefixEvents), - rememberPrefix: true, - autoGenAddrC: make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents), + offLinkRouteC: make(chan ndpOffLinkRouteEvent, maxRouterAndPrefixEvents), + prefixC: make(chan ndpPrefixEvent, maxRouterAndPrefixEvents), + autoGenAddrC: make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents), } clock := faketime.NewManualClock() s := stack.New(stack.Options{ @@ -4905,14 +4839,14 @@ func TestCleanupNDPState(t *testing.T) { Clock: clock, }) - expectRouterEvent := func() (bool, ndpRouterEvent) { + expectOffLinkRouteEvent := func() (bool, ndpOffLinkRouteEvent) { select { - case e := <-ndpDisp.routerC: + case e := <-ndpDisp.offLinkRouteC: return true, e default: } - return false, ndpRouterEvent{} + return false, ndpOffLinkRouteEvent{} } expectPrefixEvent := func() (bool, ndpPrefixEvent) { @@ -4957,8 +4891,8 @@ func TestCleanupNDPState(t *testing.T) { // multiple addresses. e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) - if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1) + if ok, _ := expectOffLinkRouteEvent(); !ok { + t.Errorf("expected off-link route event for %s on NIC(%d)", llAddr3, nicID1) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1) @@ -4968,8 +4902,8 @@ func TestCleanupNDPState(t *testing.T) { } e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) - if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1) + if ok, _ := expectOffLinkRouteEvent(); !ok { + t.Errorf("expected off-link route event for %s on NIC(%d)", llAddr4, nicID1) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1) @@ -4979,8 +4913,8 @@ func TestCleanupNDPState(t *testing.T) { } e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds)) - if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2) + if ok, _ := expectOffLinkRouteEvent(); !ok { + t.Errorf("expected off-link route event for %s on NIC(%d)", llAddr3, nicID2) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2) @@ -4990,8 +4924,8 @@ func TestCleanupNDPState(t *testing.T) { } e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds)) - if ok, _ := expectRouterEvent(); !ok { - t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2) + if ok, _ := expectOffLinkRouteEvent(); !ok { + t.Errorf("expected off-link route event for %s on NIC(%d)", llAddr4, nicID2) } if ok, _ := expectPrefixEvent(); !ok { t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2) @@ -5032,14 +4966,14 @@ func TestCleanupNDPState(t *testing.T) { test.cleanupFn(t, s) // Collect invalidation events after having NDP state cleaned up. - gotRouterEvents := make(map[ndpRouterEvent]int) + gotOffLinkRouteEvents := make(map[ndpOffLinkRouteEvent]int) for i := 0; i < maxRouterAndPrefixEvents; i++ { - ok, e := expectRouterEvent() + ok, e := expectOffLinkRouteEvent() if !ok { - t.Errorf("expected %d router events after becoming a router; got = %d", maxRouterAndPrefixEvents, i) + t.Errorf("expected %d off-link route events after becoming a router; got = %d", maxRouterAndPrefixEvents, i) break } - gotRouterEvents[e]++ + gotOffLinkRouteEvents[e]++ } gotPrefixEvents := make(map[ndpPrefixEvent]int) for i := 0; i < maxRouterAndPrefixEvents; i++ { @@ -5066,14 +5000,14 @@ func TestCleanupNDPState(t *testing.T) { t.FailNow() } - expectedRouterEvents := map[ndpRouterEvent]int{ - {nicID: nicID1, addr: llAddr3, discovered: false}: 1, - {nicID: nicID1, addr: llAddr4, discovered: false}: 1, - {nicID: nicID2, addr: llAddr3, discovered: false}: 1, - {nicID: nicID2, addr: llAddr4, discovered: false}: 1, + expectedOffLinkRouteEvents := map[ndpOffLinkRouteEvent]int{ + {nicID: nicID1, subnet: header.IPv6EmptySubnet, router: llAddr3, updated: false}: 1, + {nicID: nicID1, subnet: header.IPv6EmptySubnet, router: llAddr4, updated: false}: 1, + {nicID: nicID2, subnet: header.IPv6EmptySubnet, router: llAddr3, updated: false}: 1, + {nicID: nicID2, subnet: header.IPv6EmptySubnet, router: llAddr4, updated: false}: 1, } - if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" { - t.Errorf("router events mismatch (-want +got):\n%s", diff) + if diff := cmp.Diff(expectedOffLinkRouteEvents, gotOffLinkRouteEvents); diff != "" { + t.Errorf("off-link route events mismatch (-want +got):\n%s", diff) } expectedPrefixEvents := map[ndpPrefixEvent]int{ {nicID: nicID1, prefix: subnet1, discovered: false}: 1, @@ -5137,8 +5071,8 @@ func TestCleanupNDPState(t *testing.T) { // cancelled when the NDP state was cleaned up). clock.Advance(lifetimeSeconds * time.Second) select { - case <-ndpDisp.routerC: - t.Error("unexpected router event") + case <-ndpDisp.offLinkRouteC: + t.Error("unexpected off-link route event") default: } select { @@ -5163,7 +5097,6 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) { ndpDisp := ndpDispatcher{ dhcpv6ConfigurationC: make(chan ndpDHCPv6Event, 1), - rememberRouter: true, } e := channel.New(0, 1280, linkAddr1) s := stack.New(stack.Options{ @@ -5464,16 +5397,25 @@ func TestRouterSolicitation(t *testing.T) { RandSource: &randSource, }) - if err := s.CreateNIC(nicID, &e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + opts := stack.NICOptions{Disabled: true} + if err := s.CreateNICWithOptions(nicID, &e, opts); err != nil { + t.Fatalf("CreateNICWithOptions(%d, _, %#v) = %s", nicID, opts, err) } if addr := test.nicAddr; addr != "" { - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: addr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } } + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("EnableNIC(%d): %s", nicID, err) + } + // Make sure each RS is sent at the right time. remaining := test.maxRtrSolicit if remaining != 0 { diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 378389db2..29d580e76 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -72,9 +72,15 @@ type nic struct { sync.RWMutex spoofing bool promiscuous bool - // packetEPs is protected by mu, but the contained packetEndpointList are - // not. - packetEPs map[tcpip.NetworkProtocolNumber]*packetEndpointList + } + + packetEPs struct { + mu sync.RWMutex + + // eps is protected by the mutex, but the values contained in it are not. + // + // +checklocks:mu + eps map[tcpip.NetworkProtocolNumber]*packetEndpointList } } @@ -91,6 +97,8 @@ type packetEndpointList struct { mu sync.RWMutex // eps is protected by mu, but the contained PacketEndpoint values are not. + // + // +checklocks:mu eps []PacketEndpoint } @@ -111,6 +119,12 @@ func (p *packetEndpointList) remove(ep PacketEndpoint) { } } +func (p *packetEndpointList) len() int { + p.mu.RLock() + defer p.mu.RUnlock() + return len(p.eps) +} + // forEach calls fn with each endpoints in p while holding the read lock on p. func (p *packetEndpointList) forEach(fn func(PacketEndpoint)) { p.mu.RLock() @@ -143,18 +157,16 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC duplicateAddressDetectors: make(map[tcpip.NetworkProtocolNumber]DuplicateAddressDetector), } nic.linkResQueue.init(nic) - nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber]*packetEndpointList) + + nic.packetEPs.mu.Lock() + defer nic.packetEPs.mu.Unlock() + + nic.packetEPs.eps = make(map[tcpip.NetworkProtocolNumber]*packetEndpointList) resolutionRequired := ep.Capabilities()&CapabilityResolutionRequired != 0 - // Register supported packet and network endpoint protocols. - for _, netProto := range header.Ethertypes { - nic.mu.packetEPs[netProto] = new(packetEndpointList) - } for _, netProto := range stack.networkProtocols { netNum := netProto.Number() - nic.mu.packetEPs[netNum] = new(packetEndpointList) - netEP := netProto.NewEndpoint(nic, nic) nic.networkEndpoints[netNum] = netEP @@ -365,6 +377,8 @@ func (n *nic) writePacket(r RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt pkt.EgressRoute = r pkt.NetworkProtocolNumber = protocol + n.deliverOutboundPacket(r.RemoteLinkAddress, pkt) + if err := n.LinkEndpoint.WritePacket(r, protocol, pkt); err != nil { return err } @@ -383,6 +397,7 @@ func (n *nic) writePackets(r RouteInfo, protocol tcpip.NetworkProtocolNumber, pk for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { pkt.EgressRoute = r pkt.NetworkProtocolNumber = protocol + n.deliverOutboundPacket(r.RemoteLinkAddress, pkt) } writtenPackets, err := n.LinkEndpoint.WritePackets(r, pkts, protocol) @@ -501,7 +516,7 @@ func (n *nic) getAddressOrCreateTempInner(protocol tcpip.NetworkProtocolNumber, // addAddress adds a new address to n, so that it starts accepting packets // targeted at the given address (and network protocol). -func (n *nic) addAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) tcpip.Error { +func (n *nic) addAddress(protocolAddress tcpip.ProtocolAddress, properties AddressProperties) tcpip.Error { ep, ok := n.networkEndpoints[protocolAddress.Protocol] if !ok { return &tcpip.ErrUnknownProtocol{} @@ -512,7 +527,7 @@ func (n *nic) addAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpo return &tcpip.ErrNotSupported{} } - addressEndpoint, err := addressableEndpoint.AddAndAcquirePermanentAddress(protocolAddress.AddressWithPrefix, peb, AddressConfigStatic, false /* deprecated */) + addressEndpoint, err := addressableEndpoint.AddAndAcquirePermanentAddress(protocolAddress.AddressWithPrefix, properties) if err == nil { // We have no need for the address endpoint. addressEndpoint.DecRef() @@ -699,12 +714,9 @@ func (n *nic) isInGroup(addr tcpip.Address) bool { // This rule applies only to the slice itself, not to the items of the slice; // the ownership of the items is not retained by the caller. func (n *nic) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - n.mu.RLock() enabled := n.Enabled() // If the NIC is not yet enabled, don't receive any packets. if !enabled { - n.mu.RUnlock() - n.stats.disabledRx.packets.Increment() n.stats.disabledRx.bytes.IncrementBy(uint64(pkt.Data().Size())) return @@ -715,7 +727,6 @@ func (n *nic) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp networkEndpoint, ok := n.networkEndpoints[protocol] if !ok { - n.mu.RUnlock() n.stats.unknownL3ProtocolRcvdPackets.Increment() return } @@ -727,44 +738,87 @@ func (n *nic) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp } pkt.RXTransportChecksumValidated = n.LinkEndpoint.Capabilities()&CapabilityRXChecksumOffload != 0 - // Are any packet type sockets listening for this network protocol? - protoEPs := n.mu.packetEPs[protocol] - // Other packet type sockets that are listening for all protocols. - anyEPs := n.mu.packetEPs[header.EthernetProtocolAll] - n.mu.RUnlock() - // Deliver to interested packet endpoints without holding NIC lock. + var packetEPPkt *PacketBuffer deliverPacketEPs := func(ep PacketEndpoint) { - p := pkt.Clone() - p.PktType = tcpip.PacketHost - ep.HandlePacket(n.id, local, protocol, p) + if packetEPPkt == nil { + // Packet endpoints hold the full packet. + // + // We perform a deep copy because higher-level endpoints may point to + // the middle of a view that is held by a packet endpoint. Save/Restore + // does not support overlapping slices and will panic in this case. + // + // TODO(https://gvisor.dev/issue/6517): Avoid this copy once S/R supports + // overlapping slices (e.g. by passing a shallow copy of pkt to the packet + // endpoint). + packetEPPkt = NewPacketBuffer(PacketBufferOptions{ + Data: PayloadSince(pkt.LinkHeader()).ToVectorisedView(), + }) + // If a link header was populated in the original packet buffer, then + // populate it in the packet buffer we provide to packet endpoints as + // packet endpoints inspect link headers. + packetEPPkt.LinkHeader().Consume(pkt.LinkHeader().View().Size()) + packetEPPkt.PktType = tcpip.PacketHost + } + + ep.HandlePacket(n.id, local, protocol, packetEPPkt.Clone()) } - if protoEPs != nil { + + n.packetEPs.mu.Lock() + // Are any packet type sockets listening for this network protocol? + protoEPs, protoEPsOK := n.packetEPs.eps[protocol] + // Other packet type sockets that are listening for all protocols. + anyEPs, anyEPsOK := n.packetEPs.eps[header.EthernetProtocolAll] + n.packetEPs.mu.Unlock() + + if protoEPsOK { protoEPs.forEach(deliverPacketEPs) } - if anyEPs != nil { + if anyEPsOK { anyEPs.forEach(deliverPacketEPs) } networkEndpoint.HandlePacket(pkt) } -// DeliverOutboundPacket implements NetworkDispatcher.DeliverOutboundPacket. -func (n *nic) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { - n.mu.RLock() +// deliverOutboundPacket delivers outgoing packets to interested endpoints. +func (n *nic) deliverOutboundPacket(remote tcpip.LinkAddress, pkt *PacketBuffer) { + n.packetEPs.mu.RLock() + defer n.packetEPs.mu.RUnlock() // We do not deliver to protocol specific packet endpoints as on Linux // only ETH_P_ALL endpoints get outbound packets. // Add any other packet sockets that maybe listening for all protocols. - eps := n.mu.packetEPs[header.EthernetProtocolAll] - n.mu.RUnlock() + eps, ok := n.packetEPs.eps[header.EthernetProtocolAll] + if !ok { + return + } + + local := n.LinkAddress() + var packetEPPkt *PacketBuffer eps.forEach(func(ep PacketEndpoint) { - p := pkt.Clone() - p.PktType = tcpip.PacketOutgoing - // Add the link layer header as outgoing packets are intercepted - // before the link layer header is created. - n.LinkEndpoint.AddHeader(local, remote, protocol, p) - ep.HandlePacket(n.id, local, protocol, p) + if packetEPPkt == nil { + // Packet endpoints hold the full packet. + // + // We perform a deep copy because higher-level endpoints may point to + // the middle of a view that is held by a packet endpoint. Save/Restore + // does not support overlapping slices and will panic in this case. + // + // TODO(https://gvisor.dev/issue/6517): Avoid this copy once S/R supports + // overlapping slices (e.g. by passing a shallow copy of pkt to the packet + // endpoint). + packetEPPkt = NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: pkt.AvailableHeaderBytes(), + Data: PayloadSince(pkt.NetworkHeader()).ToVectorisedView(), + }) + // Add the link layer header as outgoing packets are intercepted before + // the link layer header is created and packet endpoints are interested + // in the link header. + n.LinkEndpoint.AddHeader(local, remote, pkt.NetworkProtocolNumber, packetEPPkt) + packetEPPkt.PktType = tcpip.PacketOutgoing + } + + ep.HandlePacket(n.id, local, pkt.NetworkProtocolNumber, packetEPPkt.Clone()) }) } @@ -779,17 +833,11 @@ func (n *nic) DeliverTransportPacket(protocol tcpip.TransportProtocolNumber, pkt transProto := state.proto - // Raw socket packets are delivered based solely on the transport - // protocol number. We do not inspect the payload to ensure it's - // validly formed. - n.stack.demux.deliverRawPacket(protocol, pkt) - // TransportHeader is empty only when pkt is an ICMP packet or was reassembled // from fragments. if pkt.TransportHeader().View().IsEmpty() { - // TODO(gvisor.dev/issue/170): ICMP packets don't have their TransportHeader - // fields set yet, parse it here. See icmp/protocol.go:protocol.Parse for a - // full explanation. + // ICMP packets don't have their TransportHeader fields set yet, parse it + // here. See icmp/protocol.go:protocol.Parse for a full explanation. if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber { // ICMP packets may be longer, but until icmp.Parse is implemented, here // we parse it using the minimum size. @@ -878,6 +926,17 @@ func (n *nic) DeliverTransportError(local, remote tcpip.Address, net tcpip.Netwo } } +// DeliverRawPacket implements TransportDispatcher. +func (n *nic) DeliverRawPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) { + // For ICMPv4 only we validate the header length for compatibility with + // raw(7) ICMP_FILTER. The same check is made in Linux here: + // https://github.com/torvalds/linux/blob/70585216/net/ipv4/raw.c#L189. + if protocol == header.ICMPv4ProtocolNumber && pkt.TransportHeader().View().Size()+pkt.Data().Size() < header.ICMPv4MinimumSize { + return + } + n.stack.demux.deliverRawPacket(protocol, pkt) +} + // ID implements NetworkInterface. func (n *nic) ID() tcpip.NICID { return n.id @@ -912,12 +971,13 @@ func (n *nic) setNUDConfigs(protocol tcpip.NetworkProtocolNumber, c NUDConfigura } func (n *nic) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) tcpip.Error { - n.mu.Lock() - defer n.mu.Unlock() + n.packetEPs.mu.Lock() + defer n.packetEPs.mu.Unlock() - eps, ok := n.mu.packetEPs[netProto] + eps, ok := n.packetEPs.eps[netProto] if !ok { - return &tcpip.ErrNotSupported{} + eps = new(packetEndpointList) + n.packetEPs.eps[netProto] = eps } eps.add(ep) @@ -925,14 +985,17 @@ func (n *nic) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep Pa } func (n *nic) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { - n.mu.Lock() - defer n.mu.Unlock() + n.packetEPs.mu.Lock() + defer n.packetEPs.mu.Unlock() - eps, ok := n.mu.packetEPs[netProto] + eps, ok := n.packetEPs.eps[netProto] if !ok { return } eps.remove(ep) + if eps.len() == 0 { + delete(n.packetEPs.eps, netProto) + } } // isValidForOutgoing returns true if the endpoint can be used to send out a diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go index 5cb342f78..c8ad93f29 100644 --- a/pkg/tcpip/stack/nic_test.go +++ b/pkg/tcpip/stack/nic_test.go @@ -127,11 +127,6 @@ func (*testIPv6Protocol) MinimumPacketSize() int { return header.IPv6MinimumSize } -// DefaultPrefixLen implements NetworkProtocol.DefaultPrefixLen. -func (*testIPv6Protocol) DefaultPrefixLen() int { - return header.IPv6AddressSize * 8 -} - // ParseAddresses implements NetworkProtocol.ParseAddresses. func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { h := header.IPv6(v) diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index 9192d8433..29c22bfd4 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -282,14 +282,12 @@ func (pk *PacketBuffer) headerView(typ headerType) tcpipbuffer.View { return v } -// Clone makes a shallow copy of pk. -// -// Clone should be called in such cases so that no modifications is done to -// underlying packet payload. +// Clone makes a semi-deep copy of pk. The underlying packet payload is +// shared. Hence, no modifications is done to underlying packet payload. func (pk *PacketBuffer) Clone() *PacketBuffer { return &PacketBuffer{ PacketBufferEntry: pk.PacketBufferEntry, - buf: pk.buf, + buf: pk.buf.Clone(), reserved: pk.reserved, pushed: pk.pushed, consumed: pk.consumed, @@ -321,14 +319,14 @@ func (pk *PacketBuffer) Network() header.Network { } } -// CloneToInbound makes a shallow copy of the packet buffer to be used as an -// inbound packet. +// CloneToInbound makes a semi-deep copy of the packet buffer (similar to +// Clone) to be used as an inbound packet. // // See PacketBuffer.Data for details about how a packet buffer holds an inbound // packet. func (pk *PacketBuffer) CloneToInbound() *PacketBuffer { newPk := &PacketBuffer{ - buf: pk.buf, + buf: pk.buf.Clone(), // Treat unfilled header portion as reserved. reserved: pk.AvailableHeaderBytes(), } diff --git a/pkg/tcpip/stack/packet_buffer_test.go b/pkg/tcpip/stack/packet_buffer_test.go index a8da34992..87b023445 100644 --- a/pkg/tcpip/stack/packet_buffer_test.go +++ b/pkg/tcpip/stack/packet_buffer_test.go @@ -123,6 +123,32 @@ func TestPacketHeaderPush(t *testing.T) { } } +func TestPacketBufferClone(t *testing.T) { + data := concatViews(makeView(20), makeView(30), makeView(40)) + pk := NewPacketBuffer(PacketBufferOptions{ + // Make a copy of data to make sure our truth data won't be taint by + // PacketBuffer. + Data: buffer.NewViewFromBytes(data).ToVectorisedView(), + }) + + bytesToDelete := 30 + originalSize := data.Size() + + clonedPks := []*PacketBuffer{ + pk.Clone(), + pk.CloneToInbound(), + } + pk.Data().DeleteFront(bytesToDelete) + if got, want := pk.Data().Size(), originalSize-bytesToDelete; got != want { + t.Errorf("original packet was not changed: size expected = %d, got = %d", want, got) + } + for _, clonedPk := range clonedPks { + if got := clonedPk.Data().Size(); got != originalSize { + t.Errorf("cloned packet should not be modified: expected size = %d, got = %d", originalSize, got) + } + } +} + func TestPacketHeaderConsume(t *testing.T) { for _, test := range []struct { name string diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index a038389e0..31b3a554d 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -265,6 +265,11 @@ type TransportDispatcher interface { // // DeliverTransportError takes ownership of the packet buffer. DeliverTransportError(local, remote tcpip.Address, _ tcpip.NetworkProtocolNumber, _ tcpip.TransportProtocolNumber, _ TransportError, _ *PacketBuffer) + + // DeliverRawPacket delivers a packet to any subscribed raw sockets. + // + // DeliverRawPacket does NOT take ownership of the packet buffer. + DeliverRawPacket(tcpip.TransportProtocolNumber, *PacketBuffer) } // PacketLooping specifies where an outbound packet should be sent. @@ -313,8 +318,7 @@ type PrimaryEndpointBehavior int const ( // CanBePrimaryEndpoint indicates the endpoint can be used as a primary - // endpoint for new connections with no local address. This is the - // default when calling NIC.AddAddress. + // endpoint for new connections with no local address. CanBePrimaryEndpoint PrimaryEndpointBehavior = iota // FirstPrimaryEndpoint indicates the endpoint should be the first @@ -327,6 +331,19 @@ const ( NeverPrimaryEndpoint ) +func (peb PrimaryEndpointBehavior) String() string { + switch peb { + case CanBePrimaryEndpoint: + return "CanBePrimaryEndpoint" + case FirstPrimaryEndpoint: + return "FirstPrimaryEndpoint" + case NeverPrimaryEndpoint: + return "NeverPrimaryEndpoint" + default: + panic(fmt.Sprintf("unknown primary endpoint behavior: %d", peb)) + } +} + // AddressConfigType is the method used to add an address. type AddressConfigType int @@ -346,6 +363,14 @@ const ( AddressConfigSlaacTemp ) +// AddressProperties contains additional properties that can be configured when +// adding an address. +type AddressProperties struct { + PEB PrimaryEndpointBehavior + ConfigType AddressConfigType + Deprecated bool +} + // AssignableAddressEndpoint is a reference counted address endpoint that may be // assigned to a NetworkEndpoint. type AssignableAddressEndpoint interface { @@ -452,7 +477,7 @@ type AddressableEndpoint interface { // Returns *tcpip.ErrDuplicateAddress if the address exists. // // Acquires and returns the AddressEndpoint for the added address. - AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, tcpip.Error) + AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties AddressProperties) (AddressEndpoint, tcpip.Error) // RemovePermanentAddress removes the passed address if it is a permanent // address. @@ -680,9 +705,6 @@ type NetworkProtocol interface { // than this targeted at this protocol. MinimumPacketSize() int - // DefaultPrefixLen returns the protocol's default prefix length. - DefaultPrefixLen() int - // ParseAddresses returns the source and destination addresses stored in a // packet of this protocol. ParseAddresses(v buffer.View) (src, dst tcpip.Address) @@ -728,16 +750,6 @@ type NetworkDispatcher interface { // // DeliverNetworkPacket takes ownership of pkt. DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) - - // DeliverOutboundPacket is called by link layer when a packet is being - // sent out. - // - // pkt.LinkHeader may or may not be set before calling - // DeliverOutboundPacket. Some packets do not have link headers (e.g. - // packets sent via loopback), and won't have the field set. - // - // DeliverOutboundPacket takes ownership of pkt. - DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // LinkEndpointCapabilities is the type associated with the capabilities @@ -841,6 +853,14 @@ type LinkEndpoint interface { // offload is enabled. If it will be used for something else, syscall filters // may need to be updated. WritePackets(RouteInfo, PacketBufferList, tcpip.NetworkProtocolNumber) (int, tcpip.Error) + + // WriteRawPacket writes a packet directly to the link. + // + // If the link-layer has its own header, the payload must already include the + // header. + // + // WriteRawPacket takes ownership of the packet. + WriteRawPacket(*PacketBuffer) tcpip.Error } // InjectableLinkEndpoint is a LinkEndpoint where inbound packets are diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 40d277312..98867a828 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -72,7 +72,8 @@ type Stack struct { // rawFactory creates raw endpoints. If nil, raw endpoints are // disabled. It is set during Stack creation and is immutable. - rawFactory RawFactory + rawFactory RawFactory + packetEndpointWriteSupported bool demux *transportDemuxer @@ -108,7 +109,7 @@ type Stack struct { handleLocal bool // tables are the iptables packet filtering and manipulation rules. - // TODO(gvisor.dev/issue/170): S/R this field. + // TODO(gvisor.dev/issue/4595): S/R this field. tables *IPTables // resumableEndpoints is a list of endpoints that need to be resumed if the @@ -119,8 +120,7 @@ type Stack struct { // by the stack. icmpRateLimiter *ICMPRateLimiter - // seed is a one-time random value initialized at stack startup - // and is used to seed the TCP port picking on active connections + // seed is a one-time random value initialized at stack startup. // // TODO(gvisor.dev/issue/940): S/R this field. seed uint32 @@ -161,6 +161,10 @@ type Stack struct { // This is required to prevent potential ACK loops. // Setting this to 0 will disable all rate limiting. tcpInvalidRateLimit time.Duration + + // tsOffsetSecret is the secret key for generating timestamp offsets + // initialized at stack startup. + tsOffsetSecret uint32 } // UniqueID is an abstract generator of unique identifiers. @@ -215,6 +219,10 @@ type Options struct { // this is non-nil. RawFactory RawFactory + // AllowPacketEndpointWrite determines if packet endpoints support write + // operations. + AllowPacketEndpointWrite bool + // RandSource is an optional source to use to generate random // numbers. If omitted it defaults to a Source seeded by the data // returned by the stack secure RNG. @@ -356,23 +364,24 @@ func New(opts Options) *Stack { opts.NUDConfigs.resetInvalidFields() s := &Stack{ - transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), - networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), - nics: make(map[tcpip.NICID]*nic), - defaultForwardingEnabled: make(map[tcpip.NetworkProtocolNumber]struct{}), - cleanupEndpoints: make(map[TransportEndpoint]struct{}), - PortManager: ports.NewPortManager(), - clock: clock, - stats: opts.Stats.FillIn(), - handleLocal: opts.HandleLocal, - tables: opts.IPTables, - icmpRateLimiter: NewICMPRateLimiter(), - seed: seed, - nudConfigs: opts.NUDConfigs, - uniqueIDGenerator: opts.UniqueID, - nudDisp: opts.NUDDisp, - randomGenerator: randomGenerator, - secureRNG: opts.SecureRNG, + transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), + networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), + nics: make(map[tcpip.NICID]*nic), + packetEndpointWriteSupported: opts.AllowPacketEndpointWrite, + defaultForwardingEnabled: make(map[tcpip.NetworkProtocolNumber]struct{}), + cleanupEndpoints: make(map[TransportEndpoint]struct{}), + PortManager: ports.NewPortManager(), + clock: clock, + stats: opts.Stats.FillIn(), + handleLocal: opts.HandleLocal, + tables: opts.IPTables, + icmpRateLimiter: NewICMPRateLimiter(), + seed: seed, + nudConfigs: opts.NUDConfigs, + uniqueIDGenerator: opts.UniqueID, + nudDisp: opts.NUDDisp, + randomGenerator: randomGenerator, + secureRNG: opts.SecureRNG, sendBufferSize: tcpip.SendBufferSizeOption{ Min: MinBufferSize, Default: DefaultBufferSize, @@ -384,6 +393,7 @@ func New(opts Options) *Stack { Max: DefaultMaxBufferSize, }, tcpInvalidRateLimit: defaultTCPInvalidRateLimit, + tsOffsetSecret: randomGenerator.Uint32(), } // Add specified network protocols. @@ -780,6 +790,9 @@ func (s *Stack) removeNICLocked(id tcpip.NICID) tcpip.Error { if !ok { return &tcpip.ErrUnknownNICID{} } + if nic.IsLoopback() { + return &tcpip.ErrNotSupported{} + } delete(s.nics, id) // Remove routes in-place. n tracks the number of routes written. @@ -903,46 +916,9 @@ type NICStateFlags struct { Loopback bool } -// AddAddress adds a new network-layer address to the specified NIC. -func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { - return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint) -} - -// AddAddressWithPrefix is the same as AddAddress, but allows you to specify -// the address prefix. -func (s *Stack) AddAddressWithPrefix(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.AddressWithPrefix) tcpip.Error { - ap := tcpip.ProtocolAddress{ - Protocol: protocol, - AddressWithPrefix: addr, - } - return s.AddProtocolAddressWithOptions(id, ap, CanBePrimaryEndpoint) -} - -// AddProtocolAddress adds a new network-layer protocol address to the -// specified NIC. -func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) tcpip.Error { - return s.AddProtocolAddressWithOptions(id, protocolAddress, CanBePrimaryEndpoint) -} - -// AddAddressWithOptions is the same as AddAddress, but allows you to specify -// whether the new endpoint can be primary or not. -func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) tcpip.Error { - netProto, ok := s.networkProtocols[protocol] - if !ok { - return &tcpip.ErrUnknownProtocol{} - } - return s.AddProtocolAddressWithOptions(id, tcpip.ProtocolAddress{ - Protocol: protocol, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: addr, - PrefixLen: netProto.DefaultPrefixLen(), - }, - }, peb) -} - -// AddProtocolAddressWithOptions is the same as AddProtocolAddress, but allows -// you to specify whether the new endpoint can be primary or not. -func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) tcpip.Error { +// AddProtocolAddress adds an address to the specified NIC, possibly with extra +// properties. +func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, properties AddressProperties) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() @@ -951,7 +927,7 @@ func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tc return &tcpip.ErrUnknownNICID{} } - return nic.addAddress(protocolAddress, peb) + return nic.addAddress(protocolAddress, properties) } // RemoveAddress removes an existing network-layer address from the specified @@ -1646,9 +1622,27 @@ func (s *Stack) WritePacketToRemote(nicID tcpip.NICID, remote tcpip.LinkAddress, ReserveHeaderBytes: int(nic.MaxHeaderLength()), Data: payload, }) + pkt.NetworkProtocolNumber = netProto return nic.WritePacketToRemote(remote, netProto, pkt) } +// WriteRawPacket writes data directly to the specified NIC without adding any +// headers. +func (s *Stack) WriteRawPacket(nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) tcpip.Error { + s.mu.RLock() + nic, ok := s.nics[nicID] + s.mu.RUnlock() + if !ok { + return &tcpip.ErrUnknownNICID{} + } + + pkt := NewPacketBuffer(PacketBufferOptions{ + Data: payload, + }) + pkt.NetworkProtocolNumber = proto + return nic.WriteRawPacket(pkt) +} + // NetworkProtocolInstance returns the protocol instance in the stack for the // specified network protocol. This method is public for protocol implementers // and tests to use. @@ -1816,8 +1810,7 @@ func (s *Stack) SetNUDConfigurations(id tcpip.NICID, proto tcpip.NetworkProtocol return nic.setNUDConfigs(proto, c) } -// Seed returns a 32 bit value that can be used as a seed value for port -// picking, ISN generation etc. +// Seed returns a 32 bit value that can be used as a seed value. // // NOTE: The seed is generated once during stack initialization only. func (s *Stack) Seed() uint32 { @@ -1872,9 +1865,8 @@ const ( // ParsePacketBufferTransport parses the provided packet buffer's transport // header. func (s *Stack) ParsePacketBufferTransport(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) ParseResult { - // TODO(gvisor.dev/issue/170): ICMP packets don't have their TransportHeader - // fields set yet, parse it here. See icmp/protocol.go:protocol.Parse for a - // full explanation. + // ICMP packets don't have their TransportHeader fields set yet, parse it + // here. See icmp/protocol.go:protocol.Parse for a full explanation. if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber { return ParsedOK } @@ -1942,3 +1934,9 @@ func (s *Stack) IsSubnetBroadcast(nicID tcpip.NICID, protocol tcpip.NetworkProto return false } + +// PacketEndpointWriteSupported returns true iff packet endpoints support write +// operations. +func (s *Stack) PacketEndpointWriteSupported() bool { + return s.packetEndpointWriteSupported +} diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 21951d05a..cd4137794 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -234,10 +234,6 @@ func (*fakeNetworkProtocol) MinimumPacketSize() int { return fakeNetHeaderLen } -func (*fakeNetworkProtocol) DefaultPrefixLen() int { - return fakeDefaultPrefixLen -} - func (f *fakeNetworkProtocol) PacketCount(intfAddr byte) int { return f.packetCount[int(intfAddr)%len(f.packetCount)] } @@ -349,12 +345,26 @@ func TestNetworkReceive(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr1, err) } - if err := s.AddAddress(1, fakeNetNumber, "\x02"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x02", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr2, err) } fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol) @@ -517,8 +527,15 @@ func TestNetworkSend(t *testing.T) { s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}}) } - if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } // Make sure that the link-layer endpoint received the outbound packet. @@ -538,12 +555,26 @@ func TestNetworkSendMultiRoute(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr1, err) } - if err := s.AddAddress(1, fakeNetNumber, "\x03"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr3 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x03", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr3, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr3, err) } ep2 := channel.New(10, defaultMTU, "") @@ -551,12 +582,26 @@ func TestNetworkSendMultiRoute(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x02", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(2, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 2, protocolAddr2, err) } - if err := s.AddAddress(2, fakeNetNumber, "\x04"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr4 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x04", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(2, protocolAddr4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 2, protocolAddr4, err) } // Set a route table that sends all packets with odd destination @@ -719,38 +764,59 @@ func TestRemoveUnknownNIC(t *testing.T) { } func TestRemoveNIC(t *testing.T) { - const nicID = 1 + for _, tt := range []struct { + name string + linkep stack.LinkEndpoint + expectErr tcpip.Error + }{ + { + name: "loopback", + linkep: loopback.New(), + expectErr: &tcpip.ErrNotSupported{}, + }, + { + name: "channel", + linkep: channel.New(0, defaultMTU, ""), + expectErr: nil, + }, + } { + t.Run(tt.name, func(t *testing.T) { + const nicID = 1 - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, - }) + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, + }) - e := linkEPWithMockedAttach{ - LinkEndpoint: loopback.New(), - } - if err := s.CreateNIC(nicID, &e); err != nil { - t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) - } + e := linkEPWithMockedAttach{ + LinkEndpoint: tt.linkep, + } + if err := s.CreateNIC(nicID, &e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } - // NIC should be present in NICInfo and attached to a NetworkDispatcher. - allNICInfo := s.NICInfo() - if _, ok := allNICInfo[nicID]; !ok { - t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo) - } - if !e.isAttached() { - t.Fatal("link endpoint not attached to a network dispatcher") - } + // NIC should be present in NICInfo and attached to a NetworkDispatcher. + allNICInfo := s.NICInfo() + if _, ok := allNICInfo[nicID]; !ok { + t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo) + } + if !e.isAttached() { + t.Fatal("link endpoint not attached to a network dispatcher") + } - // Removing a NIC should remove it from NICInfo and e should be detached from - // the NetworkDispatcher. - if err := s.RemoveNIC(nicID); err != nil { - t.Fatalf("s.RemoveNIC(%d): %s", nicID, err) - } - if nicInfo, ok := s.NICInfo()[nicID]; ok { - t.Errorf("got unexpected NICInfo entry for deleted NIC %d = %+v", nicID, nicInfo) - } - if e.isAttached() { - t.Error("link endpoint for removed NIC still attached to a network dispatcher") + // Removing a NIC should remove it from NICInfo and e should be detached from + // the NetworkDispatcher. + if got, want := s.RemoveNIC(nicID), tt.expectErr; got != want { + t.Fatalf("got s.RemoveNIC(%d) = %s, want %s", nicID, got, want) + } + if tt.expectErr == nil { + if nicInfo, ok := s.NICInfo()[nicID]; ok { + t.Errorf("got unexpected NICInfo entry for deleted NIC %d = %+v", nicID, nicInfo) + } + if e.isAttached() { + t.Error("link endpoint for removed NIC still attached to a network dispatcher") + } + } + }) } } @@ -791,8 +857,15 @@ func TestRouteWithDownNIC(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) } - if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: addr1, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID1, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID1, protocolAddr1, err) } ep2 := channel.New(1, defaultMTU, "") @@ -800,8 +873,15 @@ func TestRouteWithDownNIC(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID2, err) } - if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: addr2, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID2, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddr2, err) } // Set a route table that sends all packets with odd destination @@ -957,12 +1037,26 @@ func TestRoutes(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr1, err) } - if err := s.AddAddress(1, fakeNetNumber, "\x03"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr3 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x03", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr3, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr3, err) } ep2 := channel.New(10, defaultMTU, "") @@ -970,12 +1064,26 @@ func TestRoutes(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x02", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(2, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 2, protocolAddr2, err) } - if err := s.AddAddress(2, fakeNetNumber, "\x04"); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr4 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x04", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(2, protocolAddr4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 2, protocolAddr4, err) } // Set a route table that sends all packets with odd destination @@ -1037,8 +1145,15 @@ func TestAddressRemoval(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - if err := s.AddAddress(1, fakeNetNumber, localAddr); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: localAddr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } { subnet, err := tcpip.NewSubnet("\x00", "\x00") @@ -1087,8 +1202,15 @@ func TestAddressRemovalWithRouteHeld(t *testing.T) { fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol) buf := buffer.NewView(30) - if err := s.AddAddress(1, fakeNetNumber, localAddr); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: localAddr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } { subnet, err := tcpip.NewSubnet("\x00", "\x00") @@ -1221,8 +1343,15 @@ func TestEndpointExpiration(t *testing.T) { // 2. Add Address, everything should work. //----------------------- - if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: localAddr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } verifyAddress(t, s, nicID, localAddr) testRecv(t, fakeNet, localAddrByte, ep, buf) @@ -1249,8 +1378,8 @@ func TestEndpointExpiration(t *testing.T) { // 4. Add Address back, everything should work again. //----------------------- - if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil { - t.Fatal("AddAddress failed:", err) + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } verifyAddress(t, s, nicID, localAddr) testRecv(t, fakeNet, localAddrByte, ep, buf) @@ -1289,8 +1418,8 @@ func TestEndpointExpiration(t *testing.T) { // 7. Add Address back, everything should work again. //----------------------- - if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil { - t.Fatal("AddAddress failed:", err) + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } verifyAddress(t, s, nicID, localAddr) testRecv(t, fakeNet, localAddrByte, ep, buf) @@ -1432,8 +1561,15 @@ func TestExternalSendWithHandleLocal(t *testing.T) { if err := s.CreateNIC(nicID, ep); err != nil { t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, fakeNetNumber, localAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: localAddr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: nicID}}) @@ -1489,8 +1625,15 @@ func TestSpoofingWithAddress(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - if err := s.AddAddress(1, fakeNetNumber, localAddr); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: localAddr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } { @@ -1612,8 +1755,8 @@ func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) { } protoAddr := tcpip.ProtocolAddress{Protocol: fakeNetNumber, AddressWithPrefix: tcpip.AddressWithPrefix{Address: header.IPv4Any}} - if err := s.AddProtocolAddress(1, protoAddr); err != nil { - t.Fatalf("AddProtocolAddress(1, %v) failed: %v", protoAddr, err) + if err := s.AddProtocolAddress(1, protoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(1, %+v, {}) failed: %s", protoAddr, err) } r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */) if err != nil { @@ -1657,13 +1800,13 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { t.Fatalf("CreateNIC failed: %s", err) } nic1ProtoAddr := tcpip.ProtocolAddress{Protocol: fakeNetNumber, AddressWithPrefix: nic1Addr} - if err := s.AddProtocolAddress(1, nic1ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(1, %v) failed: %v", nic1ProtoAddr, err) + if err := s.AddProtocolAddress(1, nic1ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(1, %+v, {}) failed: %s", nic1ProtoAddr, err) } nic2ProtoAddr := tcpip.ProtocolAddress{Protocol: fakeNetNumber, AddressWithPrefix: nic2Addr} - if err := s.AddProtocolAddress(2, nic2ProtoAddr); err != nil { - t.Fatalf("AddAddress(2, %v) failed: %v", nic2ProtoAddr, err) + if err := s.AddProtocolAddress(2, nic2ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(2, %+v, {}) failed: %s", nic2ProtoAddr, err) } // Set the initial route table. @@ -1705,7 +1848,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { // 2. Case: Having an explicit route for broadcast will select that one. rt = append( []tcpip.Route{ - {Destination: tcpip.AddressWithPrefix{Address: header.IPv4Broadcast, PrefixLen: 8 * header.IPv4AddressSize}.Subnet(), NIC: 1}, + {Destination: header.IPv4Broadcast.WithPrefix().Subnet(), NIC: 1}, }, rt..., ) @@ -1787,8 +1930,15 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", anyAddr, tc.address, fakeNetNumber, err, want) } - if err := s.AddAddress(1, fakeNetNumber, anyAddr); err != nil { - t.Fatalf("AddAddress(%v, %v) failed: %v", fakeNetNumber, anyAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: anyAddr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } if r, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); tc.routeNeeded { @@ -1865,22 +2015,27 @@ func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) { // Add an address and in case of a primary one include a // prefixLen. address := tcpip.Address(bytes.Repeat([]byte{byte(i)}, addrLen)) + properties := stack.AddressProperties{PEB: behavior} if behavior == stack.CanBePrimaryEndpoint { protocolAddress := tcpip.ProtocolAddress{ - Protocol: fakeNetNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: address, - PrefixLen: addrLen * 8, - }, + Protocol: fakeNetNumber, + AddressWithPrefix: address.WithPrefix(), } - if err := s.AddProtocolAddressWithOptions(nicID, protocolAddress, behavior); err != nil { - t.Fatalf("AddProtocolAddressWithOptions(%d, %#v, %d): %s", nicID, protocolAddress, behavior, err) + if err := s.AddProtocolAddress(nicID, protocolAddress, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v): %s", nicID, protocolAddress, properties, err) } // Remember the address/prefix. primaryAddrAdded[protocolAddress.AddressWithPrefix] = struct{}{} } else { - if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address, behavior); err != nil { - t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s:", nicID, fakeNetNumber, address, behavior, err) + protocolAddress := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddress, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v): %s", nicID, protocolAddress, properties, err) } } } @@ -1975,8 +2130,8 @@ func TestGetMainNICAddressAddRemove(t *testing.T) { PrefixLen: tc.prefixLen, }, } - if err := s.AddProtocolAddress(1, protocolAddress); err != nil { - t.Fatal("AddProtocolAddress failed:", err) + if err := s.AddProtocolAddress(1, protocolAddress, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(1, %+v, {}): %s", protocolAddress, err) } // Check that we get the right initial address and prefix length. @@ -2026,33 +2181,6 @@ func verifyAddresses(t *testing.T, expectedAddresses, gotAddresses []tcpip.Proto } } -func TestAddAddress(t *testing.T) { - const nicID = 1 - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, - }) - ep := channel.New(10, defaultMTU, "") - if err := s.CreateNIC(nicID, ep); err != nil { - t.Fatal("CreateNIC failed:", err) - } - - var addrGen addressGenerator - expectedAddresses := make([]tcpip.ProtocolAddress, 0, 2) - for _, addrLen := range []int{4, 16} { - address := addrGen.next(addrLen) - if err := s.AddAddress(nicID, fakeNetNumber, address); err != nil { - t.Fatalf("AddAddress(address=%s) failed: %s", address, err) - } - expectedAddresses = append(expectedAddresses, tcpip.ProtocolAddress{ - Protocol: fakeNetNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{Address: address, PrefixLen: fakeDefaultPrefixLen}, - }) - } - - gotAddresses := s.AllAddresses()[nicID] - verifyAddresses(t, expectedAddresses, gotAddresses) -} - func TestAddProtocolAddress(t *testing.T) { const nicID = 1 s := stack.New(stack.Options{ @@ -2063,96 +2191,43 @@ func TestAddProtocolAddress(t *testing.T) { t.Fatal("CreateNIC failed:", err) } - var addrGen addressGenerator - addrLenRange := []int{4, 16} - prefixLenRange := []int{8, 13, 20, 32} - expectedAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLenRange)*len(prefixLenRange)) - for _, addrLen := range addrLenRange { - for _, prefixLen := range prefixLenRange { - protocolAddress := tcpip.ProtocolAddress{ - Protocol: fakeNetNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: addrGen.next(addrLen), - PrefixLen: prefixLen, - }, - } - if err := s.AddProtocolAddress(nicID, protocolAddress); err != nil { - t.Errorf("AddProtocolAddress(%+v) failed: %s", protocolAddress, err) - } - expectedAddresses = append(expectedAddresses, protocolAddress) - } - } - - gotAddresses := s.AllAddresses()[nicID] - verifyAddresses(t, expectedAddresses, gotAddresses) -} - -func TestAddAddressWithOptions(t *testing.T) { - const nicID = 1 - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, - }) - ep := channel.New(10, defaultMTU, "") - if err := s.CreateNIC(nicID, ep); err != nil { - t.Fatal("CreateNIC failed:", err) - } - addrLenRange := []int{4, 16} behaviorRange := []stack.PrimaryEndpointBehavior{stack.CanBePrimaryEndpoint, stack.FirstPrimaryEndpoint, stack.NeverPrimaryEndpoint} - expectedAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLenRange)*len(behaviorRange)) + configTypeRange := []stack.AddressConfigType{stack.AddressConfigStatic, stack.AddressConfigSlaac, stack.AddressConfigSlaacTemp} + deprecatedRange := []bool{false, true} + wantAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLenRange)*len(behaviorRange)*len(configTypeRange)*len(deprecatedRange)) var addrGen addressGenerator for _, addrLen := range addrLenRange { for _, behavior := range behaviorRange { - address := addrGen.next(addrLen) - if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address, behavior); err != nil { - t.Fatalf("AddAddressWithOptions(address=%s, behavior=%d) failed: %s", address, behavior, err) - } - expectedAddresses = append(expectedAddresses, tcpip.ProtocolAddress{ - Protocol: fakeNetNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{Address: address, PrefixLen: fakeDefaultPrefixLen}, - }) - } - } - - gotAddresses := s.AllAddresses()[nicID] - verifyAddresses(t, expectedAddresses, gotAddresses) -} - -func TestAddProtocolAddressWithOptions(t *testing.T) { - const nicID = 1 - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory}, - }) - ep := channel.New(10, defaultMTU, "") - if err := s.CreateNIC(nicID, ep); err != nil { - t.Fatal("CreateNIC failed:", err) - } - - addrLenRange := []int{4, 16} - prefixLenRange := []int{8, 13, 20, 32} - behaviorRange := []stack.PrimaryEndpointBehavior{stack.CanBePrimaryEndpoint, stack.FirstPrimaryEndpoint, stack.NeverPrimaryEndpoint} - expectedAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLenRange)*len(prefixLenRange)*len(behaviorRange)) - var addrGen addressGenerator - for _, addrLen := range addrLenRange { - for _, prefixLen := range prefixLenRange { - for _, behavior := range behaviorRange { - protocolAddress := tcpip.ProtocolAddress{ - Protocol: fakeNetNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: addrGen.next(addrLen), - PrefixLen: prefixLen, - }, - } - if err := s.AddProtocolAddressWithOptions(nicID, protocolAddress, behavior); err != nil { - t.Fatalf("AddProtocolAddressWithOptions(%+v, %d) failed: %s", protocolAddress, behavior, err) + for _, configType := range configTypeRange { + for _, deprecated := range deprecatedRange { + address := addrGen.next(addrLen) + properties := stack.AddressProperties{ + PEB: behavior, + ConfigType: configType, + Deprecated: deprecated, + } + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v) failed: %s", nicID, protocolAddr, properties, err) + } + wantAddresses = append(wantAddresses, tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{Address: address, PrefixLen: fakeDefaultPrefixLen}, + }) } - expectedAddresses = append(expectedAddresses, protocolAddress) } } } gotAddresses := s.AllAddresses()[nicID] - verifyAddresses(t, expectedAddresses, gotAddresses) + verifyAddresses(t, wantAddresses, gotAddresses) } func TestCreateNICWithOptions(t *testing.T) { @@ -2269,8 +2344,15 @@ func TestNICStats(t *testing.T) { if err := s.CreateNIC(nicid, ep); err != nil { t.Fatal("CreateNIC failed: ", err) } - if err := s.AddAddress(nicid, fakeNetNumber, nic.addr); err != nil { - t.Fatal("AddAddress failed:", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: nic.addr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicid, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicid, protocolAddr, err) } { @@ -2714,8 +2796,16 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { // be returned by a call to GetMainNICAddress; // else, it should. const address1 = tcpip.Address("\x01") - if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address1, pi); err != nil { - t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s", nicID, fakeNetNumber, address1, pi, err) + properties := stack.AddressProperties{PEB: pi} + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address1, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v): %s", nicID, protocolAddr, properties, err) } addr, err := s.GetMainNICAddress(nicID, fakeNetNumber) if err != nil { @@ -2764,16 +2854,31 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { // Add some other address with peb set to // FirstPrimaryEndpoint. const address3 = tcpip.Address("\x03") - if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address3, stack.FirstPrimaryEndpoint); err != nil { - t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s", nicID, fakeNetNumber, address3, stack.FirstPrimaryEndpoint, err) - + protocolAddr3 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address3, + PrefixLen: fakeDefaultPrefixLen, + }, + } + properties = stack.AddressProperties{PEB: stack.FirstPrimaryEndpoint} + if err := s.AddProtocolAddress(nicID, protocolAddr3, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v): %s", nicID, protocolAddr3, properties, err) } // Add back the address we removed earlier and // make sure the new peb was respected. // (The address should just be promoted now). - if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address1, ps); err != nil { - t.Fatalf("AddAddressWithOptions(%d, %d, %s, %d): %s", nicID, fakeNetNumber, address1, pi, err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address1, + PrefixLen: fakeDefaultPrefixLen, + }, + } + properties = stack.AddressProperties{PEB: ps} + if err := s.AddProtocolAddress(nicID, protocolAddr1, properties); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, %+v): %s", nicID, protocolAddr1, properties, err) } var primaryAddrs []tcpip.Address for _, pa := range s.NICInfo()[nicID].ProtocolAddresses { @@ -3075,8 +3180,12 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) { } for _, a := range test.nicAddrs { - if err := s.AddAddress(nicID, ipv6.ProtocolNumber, a); err != nil { - t.Errorf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, a, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: a.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } } @@ -3182,8 +3291,12 @@ func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(nicID, ipv6.ProtocolNumber, addr1); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, addr1, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: addr1.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } // The NIC should have joined addr1's solicited node multicast address. @@ -3338,8 +3451,8 @@ func TestDoDADWhenNICEnabled(t *testing.T) { PrefixLen: 128, }, } - if err := s.AddProtocolAddress(nicID, addr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err) + if err := s.AddProtocolAddress(nicID, addr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, addr, err) } // Address should be in the list of all addresses. @@ -3666,8 +3779,8 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { if err := s.CreateNIC(nicID1, ep); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) } - if err := s.AddProtocolAddress(nicID1, test.nicAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID1, test.nicAddr, err) + if err := s.AddProtocolAddress(nicID1, test.nicAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID1, test.nicAddr, err) } s.SetRouteTable(test.routes) @@ -3729,8 +3842,8 @@ func TestResolveWith(t *testing.T) { PrefixLen: 24, }, } - if err := s.AddProtocolAddress(nicID, addr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, addr, err) + if err := s.AddProtocolAddress(nicID, addr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, addr, err) } s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: nicID}}) @@ -3771,8 +3884,15 @@ func TestRouteReleaseAfterAddrRemoval(t *testing.T) { if err := s.CreateNIC(nicID, ep); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, fakeNetNumber, localAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: localAddr, + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } { subnet, err := tcpip.NewSubnet("\x00", "\x00") @@ -3860,8 +3980,8 @@ func TestGetMainNICAddressWhenNICDisabled(t *testing.T) { PrefixLen: 8, }, } - if err := s.AddProtocolAddress(nicID, protocolAddress); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, protocolAddress, err) + if err := s.AddProtocolAddress(nicID, protocolAddress, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddress, err) } // Check that we get the right initial address and prefix length. @@ -3969,44 +4089,44 @@ func TestFindRouteWithForwarding(t *testing.T) { ) type netCfg struct { - proto tcpip.NetworkProtocolNumber - factory stack.NetworkProtocolFactory - nic1Addr tcpip.Address - nic2Addr tcpip.Address - remoteAddr tcpip.Address + proto tcpip.NetworkProtocolNumber + factory stack.NetworkProtocolFactory + nic1AddrWithPrefix tcpip.AddressWithPrefix + nic2AddrWithPrefix tcpip.AddressWithPrefix + remoteAddr tcpip.Address } fakeNetCfg := netCfg{ - proto: fakeNetNumber, - factory: fakeNetFactory, - nic1Addr: nic1Addr, - nic2Addr: nic2Addr, - remoteAddr: remoteAddr, + proto: fakeNetNumber, + factory: fakeNetFactory, + nic1AddrWithPrefix: tcpip.AddressWithPrefix{Address: nic1Addr, PrefixLen: fakeDefaultPrefixLen}, + nic2AddrWithPrefix: tcpip.AddressWithPrefix{Address: nic2Addr, PrefixLen: fakeDefaultPrefixLen}, + remoteAddr: remoteAddr, } globalIPv6Addr1 := tcpip.Address(net.ParseIP("a::1").To16()) globalIPv6Addr2 := tcpip.Address(net.ParseIP("a::2").To16()) ipv6LinkLocalNIC1WithGlobalRemote := netCfg{ - proto: ipv6.ProtocolNumber, - factory: ipv6.NewProtocol, - nic1Addr: llAddr1, - nic2Addr: globalIPv6Addr2, - remoteAddr: globalIPv6Addr1, + proto: ipv6.ProtocolNumber, + factory: ipv6.NewProtocol, + nic1AddrWithPrefix: llAddr1.WithPrefix(), + nic2AddrWithPrefix: globalIPv6Addr2.WithPrefix(), + remoteAddr: globalIPv6Addr1, } ipv6GlobalNIC1WithLinkLocalRemote := netCfg{ - proto: ipv6.ProtocolNumber, - factory: ipv6.NewProtocol, - nic1Addr: globalIPv6Addr1, - nic2Addr: llAddr1, - remoteAddr: llAddr2, + proto: ipv6.ProtocolNumber, + factory: ipv6.NewProtocol, + nic1AddrWithPrefix: globalIPv6Addr1.WithPrefix(), + nic2AddrWithPrefix: llAddr1.WithPrefix(), + remoteAddr: llAddr2, } ipv6GlobalNIC1WithLinkLocalMulticastRemote := netCfg{ - proto: ipv6.ProtocolNumber, - factory: ipv6.NewProtocol, - nic1Addr: globalIPv6Addr1, - nic2Addr: globalIPv6Addr2, - remoteAddr: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", + proto: ipv6.ProtocolNumber, + factory: ipv6.NewProtocol, + nic1AddrWithPrefix: globalIPv6Addr1.WithPrefix(), + nic2AddrWithPrefix: globalIPv6Addr2.WithPrefix(), + remoteAddr: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", } tests := []struct { @@ -4015,8 +4135,8 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg netCfg forwardingEnabled bool - addrNIC tcpip.NICID - localAddr tcpip.Address + addrNIC tcpip.NICID + localAddrWithPrefix tcpip.AddressWithPrefix findRouteErr tcpip.Error dependentOnForwarding bool @@ -4026,7 +4146,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: false, addrNIC: nicID1, - localAddr: fakeNetCfg.nic2Addr, + localAddrWithPrefix: fakeNetCfg.nic2AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4035,7 +4155,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: true, addrNIC: nicID1, - localAddr: fakeNetCfg.nic2Addr, + localAddrWithPrefix: fakeNetCfg.nic2AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4044,7 +4164,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: false, addrNIC: nicID1, - localAddr: fakeNetCfg.nic1Addr, + localAddrWithPrefix: fakeNetCfg.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4053,7 +4173,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: true, addrNIC: nicID1, - localAddr: fakeNetCfg.nic1Addr, + localAddrWithPrefix: fakeNetCfg.nic1AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: true, }, @@ -4062,7 +4182,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: false, addrNIC: nicID2, - localAddr: fakeNetCfg.nic2Addr, + localAddrWithPrefix: fakeNetCfg.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4071,7 +4191,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: true, addrNIC: nicID2, - localAddr: fakeNetCfg.nic2Addr, + localAddrWithPrefix: fakeNetCfg.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4080,7 +4200,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: false, addrNIC: nicID2, - localAddr: fakeNetCfg.nic1Addr, + localAddrWithPrefix: fakeNetCfg.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4089,7 +4209,7 @@ func TestFindRouteWithForwarding(t *testing.T) { netCfg: fakeNetCfg, forwardingEnabled: true, addrNIC: nicID2, - localAddr: fakeNetCfg.nic1Addr, + localAddrWithPrefix: fakeNetCfg.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4097,7 +4217,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and localAddr on same NIC as route", netCfg: fakeNetCfg, forwardingEnabled: false, - localAddr: fakeNetCfg.nic2Addr, + localAddrWithPrefix: fakeNetCfg.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4105,7 +4225,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding enabled and localAddr on same NIC as route", netCfg: fakeNetCfg, forwardingEnabled: false, - localAddr: fakeNetCfg.nic2Addr, + localAddrWithPrefix: fakeNetCfg.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4113,7 +4233,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and localAddr on different NIC as route", netCfg: fakeNetCfg, forwardingEnabled: false, - localAddr: fakeNetCfg.nic1Addr, + localAddrWithPrefix: fakeNetCfg.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4121,7 +4241,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding enabled and localAddr on different NIC as route", netCfg: fakeNetCfg, forwardingEnabled: true, - localAddr: fakeNetCfg.nic1Addr, + localAddrWithPrefix: fakeNetCfg.nic1AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: true, }, @@ -4145,7 +4265,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and link-local local addr with route on different NIC", netCfg: ipv6LinkLocalNIC1WithGlobalRemote, forwardingEnabled: false, - localAddr: ipv6LinkLocalNIC1WithGlobalRemote.nic1Addr, + localAddrWithPrefix: ipv6LinkLocalNIC1WithGlobalRemote.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4153,7 +4273,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding enabled and link-local local addr with route on same NIC", netCfg: ipv6LinkLocalNIC1WithGlobalRemote, forwardingEnabled: true, - localAddr: ipv6LinkLocalNIC1WithGlobalRemote.nic1Addr, + localAddrWithPrefix: ipv6LinkLocalNIC1WithGlobalRemote.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNoRoute{}, dependentOnForwarding: false, }, @@ -4161,7 +4281,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and global local addr with route on same NIC", netCfg: ipv6LinkLocalNIC1WithGlobalRemote, forwardingEnabled: true, - localAddr: ipv6LinkLocalNIC1WithGlobalRemote.nic2Addr, + localAddrWithPrefix: ipv6LinkLocalNIC1WithGlobalRemote.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4169,7 +4289,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and link-local local addr with route on same NIC", netCfg: ipv6GlobalNIC1WithLinkLocalRemote, forwardingEnabled: false, - localAddr: ipv6GlobalNIC1WithLinkLocalRemote.nic2Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalRemote.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4177,7 +4297,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding enabled and link-local local addr with route on same NIC", netCfg: ipv6GlobalNIC1WithLinkLocalRemote, forwardingEnabled: true, - localAddr: ipv6GlobalNIC1WithLinkLocalRemote.nic2Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalRemote.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4185,7 +4305,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and global local addr with link-local remote on different NIC", netCfg: ipv6GlobalNIC1WithLinkLocalRemote, forwardingEnabled: false, - localAddr: ipv6GlobalNIC1WithLinkLocalRemote.nic1Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalRemote.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNetworkUnreachable{}, dependentOnForwarding: false, }, @@ -4193,7 +4313,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding enabled and global local addr with link-local remote on different NIC", netCfg: ipv6GlobalNIC1WithLinkLocalRemote, forwardingEnabled: true, - localAddr: ipv6GlobalNIC1WithLinkLocalRemote.nic1Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalRemote.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNetworkUnreachable{}, dependentOnForwarding: false, }, @@ -4201,7 +4321,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and global local addr with link-local multicast remote on different NIC", netCfg: ipv6GlobalNIC1WithLinkLocalMulticastRemote, forwardingEnabled: false, - localAddr: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic1Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNetworkUnreachable{}, dependentOnForwarding: false, }, @@ -4209,7 +4329,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding enabled and global local addr with link-local multicast remote on different NIC", netCfg: ipv6GlobalNIC1WithLinkLocalMulticastRemote, forwardingEnabled: true, - localAddr: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic1Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic1AddrWithPrefix, findRouteErr: &tcpip.ErrNetworkUnreachable{}, dependentOnForwarding: false, }, @@ -4217,7 +4337,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding disabled and global local addr with link-local multicast remote on same NIC", netCfg: ipv6GlobalNIC1WithLinkLocalMulticastRemote, forwardingEnabled: false, - localAddr: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic2Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4225,7 +4345,7 @@ func TestFindRouteWithForwarding(t *testing.T) { name: "forwarding enabled and global local addr with link-local multicast remote on same NIC", netCfg: ipv6GlobalNIC1WithLinkLocalMulticastRemote, forwardingEnabled: true, - localAddr: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic2Addr, + localAddrWithPrefix: ipv6GlobalNIC1WithLinkLocalMulticastRemote.nic2AddrWithPrefix, findRouteErr: nil, dependentOnForwarding: false, }, @@ -4247,12 +4367,20 @@ func TestFindRouteWithForwarding(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s:", nicID2, err) } - if err := s.AddAddress(nicID1, test.netCfg.proto, test.netCfg.nic1Addr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, test.netCfg.proto, test.netCfg.nic1Addr, err) + protocolAddr1 := tcpip.ProtocolAddress{ + Protocol: test.netCfg.proto, + AddressWithPrefix: test.netCfg.nic1AddrWithPrefix, + } + if err := s.AddProtocolAddress(nicID1, protocolAddr1, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID1, protocolAddr1, err) } - if err := s.AddAddress(nicID2, test.netCfg.proto, test.netCfg.nic2Addr); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, test.netCfg.proto, test.netCfg.nic2Addr, err) + protocolAddr2 := tcpip.ProtocolAddress{ + Protocol: test.netCfg.proto, + AddressWithPrefix: test.netCfg.nic2AddrWithPrefix, + } + if err := s.AddProtocolAddress(nicID2, protocolAddr2, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddr2, err) } if err := s.SetForwardingDefaultAndAllNICs(test.netCfg.proto, test.forwardingEnabled); err != nil { @@ -4261,20 +4389,20 @@ func TestFindRouteWithForwarding(t *testing.T) { s.SetRouteTable([]tcpip.Route{{Destination: test.netCfg.remoteAddr.WithPrefix().Subnet(), NIC: nicID2}}) - r, err := s.FindRoute(test.addrNIC, test.localAddr, test.netCfg.remoteAddr, test.netCfg.proto, false /* multicastLoop */) + r, err := s.FindRoute(test.addrNIC, test.localAddrWithPrefix.Address, test.netCfg.remoteAddr, test.netCfg.proto, false /* multicastLoop */) if err == nil { defer r.Release() } if diff := cmp.Diff(test.findRouteErr, err); diff != "" { - t.Fatalf("unexpected error from FindRoute(%d, %s, %s, %d, false), (-want, +got):\n%s", test.addrNIC, test.localAddr, test.netCfg.remoteAddr, test.netCfg.proto, diff) + t.Fatalf("unexpected error from FindRoute(%d, %s, %s, %d, false), (-want, +got):\n%s", test.addrNIC, test.localAddrWithPrefix.Address, test.netCfg.remoteAddr, test.netCfg.proto, diff) } if test.findRouteErr != nil { return } - if r.LocalAddress() != test.localAddr { - t.Errorf("got r.LocalAddress() = %s, want = %s", r.LocalAddress(), test.localAddr) + if r.LocalAddress() != test.localAddrWithPrefix.Address { + t.Errorf("got r.LocalAddress() = %s, want = %s", r.LocalAddress(), test.localAddrWithPrefix.Address) } if r.RemoteAddress() != test.netCfg.remoteAddr { t.Errorf("got r.RemoteAddress() = %s, want = %s", r.RemoteAddress(), test.netCfg.remoteAddr) @@ -4297,8 +4425,8 @@ func TestFindRouteWithForwarding(t *testing.T) { if !ok { t.Fatal("packet not sent through ep2") } - if pkt.Route.LocalAddress != test.localAddr { - t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", pkt.Route.LocalAddress, test.localAddr) + if pkt.Route.LocalAddress != test.localAddrWithPrefix.Address { + t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", pkt.Route.LocalAddress, test.localAddrWithPrefix.Address) } if pkt.Route.RemoteAddress != test.netCfg.remoteAddr { t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, test.netCfg.remoteAddr) diff --git a/pkg/tcpip/stack/tcp.go b/pkg/tcpip/stack/tcp.go index e90c1a770..dc7289441 100644 --- a/pkg/tcpip/stack/tcp.go +++ b/pkg/tcpip/stack/tcp.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/internal/tcp" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) @@ -380,15 +381,18 @@ type TCPSndBufState struct { // SndClosed indicates that the endpoint has been closed for sends. SndClosed bool - // SndBufInQueue is the number of bytes in the send queue. - SndBufInQueue seqnum.Size - // PacketTooBigCount is used to notify the main protocol routine how // many times a "packet too big" control packet is received. PacketTooBigCount int // SndMTU is the smallest MTU seen in the control packets received. SndMTU int + + // AutoTuneSndBufDisabled indicates that the auto tuning of send buffer + // is disabled. + // + // Must be accessed using atomic operations. + AutoTuneSndBufDisabled uint32 } // TCPEndpointStateInner contains the members of TCPEndpointState used directly @@ -399,7 +403,7 @@ type TCPSndBufState struct { type TCPEndpointStateInner struct { // TSOffset is a randomized offset added to the value of the TSVal // field in the timestamp option. - TSOffset uint32 + TSOffset tcp.TSOffset // SACKPermitted is set to true if the peer sends the TCPSACKPermitted // option in the SYN/SYN-ACK. diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 8a8454a6a..542d9257c 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -16,6 +16,7 @@ package stack import ( "fmt" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" @@ -31,11 +32,13 @@ type protocolIDs struct { // transportEndpoints manages all endpoints of a given protocol. It has its own // mutex so as to reduce interference between protocols. type transportEndpoints struct { - // mu protects all fields of the transportEndpoints. - mu sync.RWMutex + mu sync.RWMutex + // +checklocks:mu endpoints map[TransportEndpointID]*endpointsByNIC // rawEndpoints contains endpoints for raw sockets, which receive all // traffic of a given protocol regardless of port. + // + // +checklocks:mu rawEndpoints []RawTransportEndpoint } @@ -68,7 +71,7 @@ func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint { // descending order of match quality. If a call to yield returns false, // iterEndpointsLocked stops iteration and returns immediately. // -// Preconditions: eps.mu must be locked. +// +checklocks:eps.mu func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) { // Try to find a match with the id as provided. if ep, ok := eps.endpoints[id]; ok { @@ -109,7 +112,7 @@ func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield // findAllEndpointsLocked returns all endpointsByNIC in eps that match id, in // descending order of match quality. // -// Preconditions: eps.mu must be locked. +// +checklocks:eps.mu func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC { var matchedEPs []*endpointsByNIC eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool { @@ -121,7 +124,7 @@ func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) [] // findEndpointLocked returns the endpoint that most closely matches the given id. // -// Preconditions: eps.mu must be locked. +// +checklocks:eps.mu func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC { var matchedEP *endpointsByNIC eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool { @@ -132,10 +135,12 @@ func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpo } type endpointsByNIC struct { - mu sync.RWMutex - endpoints map[tcpip.NICID]*multiPortEndpoint // seed is a random secret for a jenkins hash. seed uint32 + + mu sync.RWMutex + // +checklocks:mu + endpoints map[tcpip.NICID]*multiPortEndpoint } func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint { @@ -170,7 +175,7 @@ func (epsByNIC *endpointsByNIC) handlePacket(id TransportEndpointID, pkt *Packet return true } // multiPortEndpoints are guaranteed to have at least one element. - transEP := selectEndpoint(id, mpep, epsByNIC.seed) + transEP := mpep.selectEndpoint(id, epsByNIC.seed) if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue { queuedProtocol.QueuePacket(transEP, id, pkt) epsByNIC.mu.RUnlock() @@ -199,7 +204,7 @@ func (epsByNIC *endpointsByNIC) handleError(n *nic, id TransportEndpointID, tran // broadcast like we are doing with handlePacket above? // multiPortEndpoints are guaranteed to have at least one element. - selectEndpoint(id, mpep, epsByNIC.seed).HandleError(transErr, pkt) + mpep.selectEndpoint(id, epsByNIC.seed).HandleError(transErr, pkt) } // registerEndpoint returns true if it succeeds. It fails and returns @@ -215,10 +220,17 @@ func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto t netProto: netProto, transProto: transProto, } - epsByNIC.endpoints[bindToDevice] = multiPortEp } - return multiPortEp.singleRegisterEndpoint(t, flags) + if err := multiPortEp.singleRegisterEndpoint(t, flags); err != nil { + return err + } + // Only add this newly created multiportEndpoint if the singleRegisterEndpoint + // succeeded. + if !ok { + epsByNIC.endpoints[bindToDevice] = multiPortEp + } + return nil } func (epsByNIC *endpointsByNIC) checkEndpoint(flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { @@ -325,15 +337,18 @@ func (d *transportDemuxer) checkEndpoint(netProtos []tcpip.NetworkProtocolNumber // // +stateify savable type multiPortEndpoint struct { - mu sync.RWMutex `state:"nosave"` demux *transportDemuxer netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber + flags ports.FlagCounter + + mu sync.RWMutex `state:"nosave"` // endpoints stores the transport endpoints in the order in which they // were bound. This is required for UDP SO_REUSEADDR. + // + // +checklocks:mu endpoints []TransportEndpoint - flags ports.FlagCounter } func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint { @@ -354,13 +369,16 @@ func reciprocalScale(val, n uint32) uint32 { // selectEndpoint calculates a hash of destination and source addresses and // ports then uses it to select a socket. In this case, all packets from one // address will be sent to same endpoint. -func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32) TransportEndpoint { - if len(mpep.endpoints) == 1 { - return mpep.endpoints[0] +func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID, seed uint32) TransportEndpoint { + ep.mu.RLock() + defer ep.mu.RUnlock() + + if len(ep.endpoints) == 1 { + return ep.endpoints[0] } - if mpep.flags.SharedFlags().ToFlags().Effective().MostRecent { - return mpep.endpoints[len(mpep.endpoints)-1] + if ep.flags.SharedFlags().ToFlags().Effective().MostRecent { + return ep.endpoints[len(ep.endpoints)-1] } payload := []byte{ @@ -376,8 +394,8 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32 h.Write([]byte(id.RemoteAddress)) hash := h.Sum32() - idx := reciprocalScale(hash, uint32(len(mpep.endpoints))) - return mpep.endpoints[idx] + idx := reciprocalScale(hash, uint32(len(ep.endpoints))) + return ep.endpoints[idx] } func (ep *multiPortEndpoint) handlePacketAll(id TransportEndpointID, pkt *PacketBuffer) { @@ -405,7 +423,6 @@ func (ep *multiPortEndpoint) handlePacketAll(id TransportEndpointID, pkt *Packet func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, flags ports.Flags) tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() - bits := flags.Bits() & ports.MultiBindFlagMask if len(ep.endpoints) != 0 { @@ -468,17 +485,21 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol eps.mu.Lock() defer eps.mu.Unlock() - epsByNIC, ok := eps.endpoints[id] if !ok { epsByNIC = &endpointsByNIC{ endpoints: make(map[tcpip.NICID]*multiPortEndpoint), - seed: d.stack.Seed(), + seed: d.stack.seed, } + } + if err := epsByNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice); err != nil { + return err + } + // Only add this newly created epsByNIC if registerEndpoint succeeded. + if !ok { eps.endpoints[id] = epsByNIC } - - return epsByNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice) + return nil } func (d *transportDemuxer) singleCheckEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { @@ -646,7 +667,7 @@ func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolN } } - ep := selectEndpoint(id, mpep, epsByNIC.seed) + ep := mpep.selectEndpoint(id, epsByNIC.seed) epsByNIC.mu.RUnlock() return ep } diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go index 0972c94de..cd3a8c25a 100644 --- a/pkg/tcpip/stack/transport_demuxer_test.go +++ b/pkg/tcpip/stack/transport_demuxer_test.go @@ -35,7 +35,7 @@ import ( const ( testSrcAddrV6 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" - testDstAddrV6 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + testDstAddrV6 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") testSrcAddrV4 = "\x0a\x00\x00\x01" testDstAddrV4 = "\x0a\x00\x00\x02" @@ -64,12 +64,20 @@ func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICI } linkEps[linkEpID] = channelEp - if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, testDstAddrV4); err != nil { - t.Fatalf("AddAddress IPv4 failed: %s", err) + protocolAddrV4 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.Address(testDstAddrV4).WithPrefix(), + } + if err := s.AddProtocolAddress(linkEpID, protocolAddrV4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", linkEpID, protocolAddrV4, err) } - if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, testDstAddrV6); err != nil { - t.Fatalf("AddAddress IPv6 failed: %s", err) + protocolAddrV6 := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: testDstAddrV6.WithPrefix(), + } + if err := s.AddProtocolAddress(linkEpID, protocolAddrV6, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", linkEpID, protocolAddrV6, err) } } @@ -203,6 +211,56 @@ func TestTransportDemuxerRegister(t *testing.T) { } } +func TestTransportDemuxerRegisterMultiple(t *testing.T) { + type test struct { + flags ports.Flags + want tcpip.Error + } + for _, subtest := range []struct { + name string + tests []test + }{ + {"zeroFlags", []test{ + {ports.Flags{}, nil}, + {ports.Flags{}, &tcpip.ErrPortInUse{}}, + }}, + {"multibindFlags", []test{ + // Allow multiple registrations same TransportEndpointID with multibind flags. + {ports.Flags{LoadBalanced: true, MostRecent: true}, nil}, + {ports.Flags{LoadBalanced: true, MostRecent: true}, nil}, + // Disallow registration w/same ID for a non-multibindflag. + {ports.Flags{TupleOnly: true}, &tcpip.ErrPortInUse{}}, + }}, + } { + t.Run(subtest.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + }) + var eps []tcpip.Endpoint + for idx, test := range subtest.tests { + var wq waiter.Queue + ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq) + if err != nil { + t.Fatal(err) + } + eps = append(eps, ep) + tEP, ok := ep.(stack.TransportEndpoint) + if !ok { + t.Fatalf("%T does not implement stack.TransportEndpoint", ep) + } + id := stack.TransportEndpointID{LocalPort: 1} + if got, want := s.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{ipv4.ProtocolNumber}, udp.ProtocolNumber, id, tEP, test.flags, 0), test.want; got != want { + t.Fatalf("test index: %d, s.RegisterTransportEndpoint(ipv4.ProtocolNumber, udp.ProtocolNumber, _, _, %+v, 0) = %s, want %s", idx, test.flags, got, want) + } + } + for _, ep := range eps { + ep.Close() + } + }) + } +} + // TestBindToDeviceDistribution injects varied packets on input devices and checks that // the distribution of packets received matches expectations. func TestBindToDeviceDistribution(t *testing.T) { diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 839178809..655931715 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -357,8 +357,15 @@ func TestTransportReceive(t *testing.T) { s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}}) } - if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil { - t.Fatalf("AddAddress failed: %v", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } // Create endpoint and connect to remote address. @@ -428,8 +435,15 @@ func TestTransportControlReceive(t *testing.T) { s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}}) } - if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil { - t.Fatalf("AddAddress failed: %v", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } // Create endpoint and connect to remote address. @@ -497,8 +511,15 @@ func TestTransportSend(t *testing.T) { t.Fatalf("CreateNIC failed: %v", err) } - if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil { - t.Fatalf("AddAddress failed: %v", err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: fakeNetNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: "\x01", + PrefixLen: fakeDefaultPrefixLen, + }, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", 1, protocolAddr, err) } { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 91622fa4c..a9ce148b9 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -19,7 +19,7 @@ // The starting point is the creation and configuration of a stack. A stack can // be created by calling the New() function of the tcpip/stack/stack package; // configuring a stack involves creating NICs (via calls to Stack.CreateNIC()), -// adding network addresses (via calls to Stack.AddAddress()), and +// adding network addresses (via calls to Stack.AddProtocolAddress()), and // setting a route table (via a call to Stack.SetRouteTable()). // // Once a stack is configured, endpoints can be created by calling @@ -465,11 +465,11 @@ type ControlMessages struct { // PacketOwner is used to get UID and GID of the packet. type PacketOwner interface { - // UID returns UID of the packet. - UID() uint32 + // UID returns KUID of the packet. + KUID() uint32 - // GID returns GID of the packet. - GID() uint32 + // GID returns KGID of the packet. + KGID() uint32 } // ReadOptions contains options for Endpoint.Read. @@ -1845,6 +1845,10 @@ type TCPStats struct { // FailedPortReservations is the number of times TCP failed to reserve // a port. FailedPortReservations *StatCounter + + // SegmentsAckedWithDSACK is the number of segments acknowledged with + // DSACK. + SegmentsAckedWithDSACK *StatCounter } // UDPStats collects UDP-specific stats. diff --git a/pkg/tcpip/tests/integration/forward_test.go b/pkg/tcpip/tests/integration/forward_test.go index 92fa6257d..6e1d4720d 100644 --- a/pkg/tcpip/tests/integration/forward_test.go +++ b/pkg/tcpip/tests/integration/forward_test.go @@ -473,11 +473,19 @@ func TestMulticastForwarding(t *testing.T) { t.Fatalf("s.CreateNIC(%d, _): %s", nicID2, err) } - if err := s.AddAddress(nicID2, ipv4.ProtocolNumber, utils.Ipv4Addr.Address); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID2, ipv4.ProtocolNumber, utils.Ipv4Addr.Address, err) + protocolAddrV4 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: utils.Ipv4Addr, } - if err := s.AddAddress(nicID2, ipv6.ProtocolNumber, utils.Ipv6Addr.Address); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID2, ipv6.ProtocolNumber, utils.Ipv6Addr.Address, err) + if err := s.AddProtocolAddress(nicID2, protocolAddrV4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddrV4, err) + } + protocolAddrV6 := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: utils.Ipv6Addr, + } + if err := s.AddProtocolAddress(nicID2, protocolAddrV6, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddrV6, err) } if err := s.SetForwardingDefaultAndAllNICs(ipv4.ProtocolNumber, true); err != nil { @@ -612,8 +620,8 @@ func TestPerInterfaceForwarding(t *testing.T) { addr: utils.RouterNIC2IPv6Addr, }, } { - if err := s.AddProtocolAddress(add.nicID, add.addr); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", add.nicID, add.addr, err) + if err := s.AddProtocolAddress(add.nicID, add.addr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", add.nicID, add.addr, err) } } diff --git a/pkg/tcpip/tests/integration/iptables_test.go b/pkg/tcpip/tests/integration/iptables_test.go index f9ab7d0af..28b49c6be 100644 --- a/pkg/tcpip/tests/integration/iptables_test.go +++ b/pkg/tcpip/tests/integration/iptables_test.go @@ -49,10 +49,10 @@ const ( nicName = "nic1" anotherNicName = "nic2" linkAddr = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") - srcAddrV4 = "\x0a\x00\x00\x01" - dstAddrV4 = "\x0a\x00\x00\x02" - srcAddrV6 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" - dstAddrV6 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + srcAddrV4 = tcpip.Address("\x0a\x00\x00\x01") + dstAddrV4 = tcpip.Address("\x0a\x00\x00\x02") + srcAddrV6 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + dstAddrV6 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") payloadSize = 20 ) @@ -66,8 +66,12 @@ func genStackV6(t *testing.T) (*stack.Stack, *channel.Endpoint) { if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { t.Fatalf("CreateNICWithOptions(%d, _, %#v) = %s", nicID, nicOpts, err) } - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, dstAddrV6); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, dstAddrV6, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: dstAddrV6.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } return s, e } @@ -82,8 +86,12 @@ func genStackV4(t *testing.T) (*stack.Stack, *channel.Endpoint) { if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { t.Fatalf("CreateNICWithOptions(%d, _, %#v) = %s", nicID, nicOpts, err) } - if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, dstAddrV4); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, dstAddrV4, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: dstAddrV4.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } return s, e } @@ -601,11 +609,19 @@ func TestIPTableWritePackets(t *testing.T) { if err := s.CreateNIC(nicID, &e); err != nil { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } - if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, srcAddrV6); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, srcAddrV6, err) + protocolAddrV6 := tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: srcAddrV6.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddrV6, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddrV6, err) } - if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, srcAddrV4); err != nil { - t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, srcAddrV4, err) + protocolAddrV4 := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: srcAddrV4.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddrV4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddrV4, err) } s.SetRouteTable([]tcpip.Route{ @@ -856,11 +872,19 @@ func TestForwardingHook(t *testing.T) { t.Fatalf("s.CreateNICWithOptions(%d, _, _): %s", nicID2, err) } - if err := s.AddAddress(nicID2, ipv4.ProtocolNumber, utils.Ipv4Addr.Address); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID2, ipv4.ProtocolNumber, utils.Ipv4Addr.Address, err) + protocolAddrV4 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: utils.Ipv4Addr.Address.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID2, protocolAddrV4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddrV4, err) + } + protocolAddrV6 := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: utils.Ipv6Addr.Address.WithPrefix(), } - if err := s.AddAddress(nicID2, ipv6.ProtocolNumber, utils.Ipv6Addr.Address); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID2, ipv6.ProtocolNumber, utils.Ipv6Addr.Address, err) + if err := s.AddProtocolAddress(nicID2, protocolAddrV6, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddrV6, err) } if err := s.SetForwardingDefaultAndAllNICs(ipv4.ProtocolNumber, true); err != nil { @@ -1037,22 +1061,22 @@ func TestInputHookWithLocalForwarding(t *testing.T) { if err := s.CreateNICWithOptions(nicID1, e1, stack.NICOptions{Name: nic1Name}); err != nil { t.Fatalf("s.CreateNICWithOptions(%d, _, _): %s", nicID1, err) } - if err := s.AddProtocolAddress(nicID1, utils.Ipv4Addr1); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID1, utils.Ipv4Addr1, err) + if err := s.AddProtocolAddress(nicID1, utils.Ipv4Addr1, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID1, utils.Ipv4Addr1, err) } - if err := s.AddProtocolAddress(nicID1, utils.Ipv6Addr1); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID1, utils.Ipv6Addr1, err) + if err := s.AddProtocolAddress(nicID1, utils.Ipv6Addr1, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID1, utils.Ipv6Addr1, err) } e2 := channel.New(1, header.IPv6MinimumMTU, "") if err := s.CreateNICWithOptions(nicID2, e2, stack.NICOptions{Name: nic2Name}); err != nil { t.Fatalf("s.CreateNICWithOptions(%d, _, _): %s", nicID2, err) } - if err := s.AddProtocolAddress(nicID2, utils.Ipv4Addr2); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID2, utils.Ipv4Addr2, err) + if err := s.AddProtocolAddress(nicID2, utils.Ipv4Addr2, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID2, utils.Ipv4Addr2, err) } - if err := s.AddProtocolAddress(nicID2, utils.Ipv6Addr2); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID2, utils.Ipv6Addr2, err) + if err := s.AddProtocolAddress(nicID2, utils.Ipv6Addr2, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID2, utils.Ipv6Addr2, err) } if err := s.SetForwardingDefaultAndAllNICs(ipv4.ProtocolNumber, true); err != nil { diff --git a/pkg/tcpip/tests/integration/link_resolution_test.go b/pkg/tcpip/tests/integration/link_resolution_test.go index 27caa0c28..95ddd8ec3 100644 --- a/pkg/tcpip/tests/integration/link_resolution_test.go +++ b/pkg/tcpip/tests/integration/link_resolution_test.go @@ -56,17 +56,17 @@ func setupStack(t *testing.T, stackOpts stack.Options, host1NICID, host2NICID tc t.Fatalf("host2Stack.CreateNIC(%d, _): %s", host2NICID, err) } - if err := host1Stack.AddProtocolAddress(host1NICID, utils.Ipv4Addr1); err != nil { - t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, utils.Ipv4Addr1, err) + if err := host1Stack.AddProtocolAddress(host1NICID, utils.Ipv4Addr1, stack.AddressProperties{}); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %+v, {}): %s", host1NICID, utils.Ipv4Addr1, err) } - if err := host2Stack.AddProtocolAddress(host2NICID, utils.Ipv4Addr2); err != nil { - t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, utils.Ipv4Addr2, err) + if err := host2Stack.AddProtocolAddress(host2NICID, utils.Ipv4Addr2, stack.AddressProperties{}); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %+v, {}): %s", host2NICID, utils.Ipv4Addr2, err) } - if err := host1Stack.AddProtocolAddress(host1NICID, utils.Ipv6Addr1); err != nil { - t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, utils.Ipv6Addr1, err) + if err := host1Stack.AddProtocolAddress(host1NICID, utils.Ipv6Addr1, stack.AddressProperties{}); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %+v, {}): %s", host1NICID, utils.Ipv6Addr1, err) } - if err := host2Stack.AddProtocolAddress(host2NICID, utils.Ipv6Addr2); err != nil { - t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, utils.Ipv6Addr2, err) + if err := host2Stack.AddProtocolAddress(host2NICID, utils.Ipv6Addr2, stack.AddressProperties{}); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %+v, {}): %s", host2NICID, utils.Ipv6Addr2, err) } host1Stack.SetRouteTable([]tcpip.Route{ @@ -568,8 +568,8 @@ func TestForwardingWithLinkResolutionFailure(t *testing.T) { Protocol: test.networkProtocolNumber, AddressWithPrefix: test.incomingAddr, } - if err := s.AddProtocolAddress(incomingNICID, incomingProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", incomingNICID, incomingProtoAddr, err) + if err := s.AddProtocolAddress(incomingNICID, incomingProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", incomingNICID, incomingProtoAddr, err) } // Set up endpoint through which we will attempt to forward packets. @@ -582,8 +582,8 @@ func TestForwardingWithLinkResolutionFailure(t *testing.T) { Protocol: test.networkProtocolNumber, AddressWithPrefix: test.outgoingAddr, } - if err := s.AddProtocolAddress(outgoingNICID, outgoingProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", outgoingNICID, outgoingProtoAddr, err) + if err := s.AddProtocolAddress(outgoingNICID, outgoingProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", outgoingNICID, outgoingProtoAddr, err) } s.SetRouteTable([]tcpip.Route{ diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go index 87d36e1dd..f33223e79 100644 --- a/pkg/tcpip/tests/integration/loopback_test.go +++ b/pkg/tcpip/tests/integration/loopback_test.go @@ -44,20 +44,17 @@ type ndpDispatcher struct{} func (*ndpDispatcher) OnDuplicateAddressDetectionResult(tcpip.NICID, tcpip.Address, stack.DADResult) { } -func (*ndpDispatcher) OnDefaultRouterDiscovered(tcpip.NICID, tcpip.Address) bool { - return false +func (*ndpDispatcher) OnOffLinkRouteUpdated(tcpip.NICID, tcpip.Subnet, tcpip.Address, header.NDPRoutePreference) { } -func (*ndpDispatcher) OnDefaultRouterInvalidated(tcpip.NICID, tcpip.Address) {} +func (*ndpDispatcher) OnOffLinkRouteInvalidated(tcpip.NICID, tcpip.Subnet, tcpip.Address) {} -func (*ndpDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool { - return false +func (*ndpDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) { } func (*ndpDispatcher) OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) {} -func (*ndpDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool { - return true +func (*ndpDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) { } func (*ndpDispatcher) OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) {} @@ -198,8 +195,8 @@ func TestLoopbackAcceptAllInSubnetUDP(t *testing.T) { if err := s.CreateNIC(nicID, loopback.New()); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddProtocolAddress(nicID, test.addAddress); err != nil { - t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, test.addAddress, err) + if err := s.AddProtocolAddress(nicID, test.addAddress, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, test.addAddress, err) } s.SetRouteTable([]tcpip.Route{ { @@ -293,8 +290,8 @@ func TestLoopbackSubnetLifetimeBoundToAddr(t *testing.T) { if err := s.CreateNIC(nicID, loopback.New()); err != nil { t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddProtocolAddress(nicID, protoAddr); err != nil { - t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, protoAddr, err) + if err := s.AddProtocolAddress(nicID, protoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID, protoAddr, err) } s.SetRouteTable([]tcpip.Route{ { @@ -434,8 +431,8 @@ func TestLoopbackAcceptAllInSubnetTCP(t *testing.T) { if err := s.CreateNIC(nicID, loopback.New()); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddProtocolAddress(nicID, test.addAddress); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, test.addAddress, err) + if err := s.AddProtocolAddress(nicID, test.addAddress, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, test.addAddress, err) } s.SetRouteTable([]tcpip.Route{ { @@ -696,21 +693,40 @@ func TestExternalLoopbackTraffic(t *testing.T) { if err := s.CreateNIC(nicID1, e); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) } - if err := s.AddAddressWithPrefix(nicID1, ipv4.ProtocolNumber, utils.Ipv4Addr); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s): %s", nicID1, ipv4.ProtocolNumber, utils.Ipv4Addr, err) + v4Addr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: utils.Ipv4Addr, } - if err := s.AddAddressWithPrefix(nicID1, ipv6.ProtocolNumber, utils.Ipv6Addr); err != nil { - t.Fatalf("AddAddressWithPrefix(%d, %d, %s): %s", nicID1, ipv6.ProtocolNumber, utils.Ipv6Addr, err) + if err := s.AddProtocolAddress(nicID1, v4Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID1, v4Addr, err) + } + v6Addr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: utils.Ipv6Addr, + } + if err := s.AddProtocolAddress(nicID1, v6Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID1, v6Addr, err) } if err := s.CreateNIC(nicID2, loopback.New()); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID2, err) } - if err := s.AddAddress(nicID2, ipv4.ProtocolNumber, ipv4Loopback); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, ipv4.ProtocolNumber, ipv4Loopback, err) + protocolAddrV4 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: ipv4Loopback, + PrefixLen: 8, + }, + } + if err := s.AddProtocolAddress(nicID2, protocolAddrV4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddrV4, err) + } + protocolAddrV6 := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: header.IPv6Loopback.WithPrefix(), } - if err := s.AddAddress(nicID2, ipv6.ProtocolNumber, header.IPv6Loopback); err != nil { - t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, ipv6.ProtocolNumber, header.IPv6Loopback, err) + if err := s.AddProtocolAddress(nicID2, protocolAddrV6, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID2, protocolAddrV6, err) } if test.forwarding { diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go index 2d0a6e6a7..7753e7d6e 100644 --- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go +++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go @@ -119,12 +119,12 @@ func TestPingMulticastBroadcast(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } ipv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: utils.Ipv4Addr} - if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, ipv4ProtoAddr, err) + if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, ipv4ProtoAddr, err) } ipv6ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: utils.Ipv6Addr} - if err := s.AddProtocolAddress(nicID, ipv6ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, ipv6ProtoAddr, err) + if err := s.AddProtocolAddress(nicID, ipv6ProtoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, ipv6ProtoAddr, err) } // Default routes for IPv4 and IPv6 so ICMP can find a route to the remote @@ -396,8 +396,8 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } protoAddr := tcpip.ProtocolAddress{Protocol: test.proto, AddressWithPrefix: test.localAddr} - if err := s.AddProtocolAddress(nicID, protoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, protoAddr, err) + if err := s.AddProtocolAddress(nicID, protoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protoAddr, err) } var wq waiter.Queue @@ -474,8 +474,8 @@ func TestReuseAddrAndBroadcast(t *testing.T) { PrefixLen: 8, }, } - if err := s.AddProtocolAddress(nicID, protoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, protoAddr, err) + if err := s.AddProtocolAddress(nicID, protoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protoAddr, err) } s.SetRouteTable([]tcpip.Route{ @@ -642,8 +642,8 @@ func TestUDPAddRemoveMembershipSocketOption(t *testing.T) { t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } protoAddr := tcpip.ProtocolAddress{Protocol: test.proto, AddressWithPrefix: test.localAddr} - if err := s.AddProtocolAddress(nicID, protoAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, protoAddr, err) + if err := s.AddProtocolAddress(nicID, protoAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protoAddr, err) } // Set the route table so that UDP can find a NIC that is diff --git a/pkg/tcpip/tests/integration/route_test.go b/pkg/tcpip/tests/integration/route_test.go index ac3c703d4..422eb8408 100644 --- a/pkg/tcpip/tests/integration/route_test.go +++ b/pkg/tcpip/tests/integration/route_test.go @@ -47,7 +47,10 @@ func TestLocalPing(t *testing.T) { // request/reply packets. icmpDataOffset = 8 ) - ipv4Loopback := testutil.MustParse4("127.0.0.1") + ipv4Loopback := tcpip.AddressWithPrefix{ + Address: testutil.MustParse4("127.0.0.1"), + PrefixLen: 8, + } channelEP := func() stack.LinkEndpoint { return channel.New(1, header.IPv6MinimumMTU, "") } channelEPCheck := func(t *testing.T, e stack.LinkEndpoint) { @@ -82,7 +85,7 @@ func TestLocalPing(t *testing.T) { transProto tcpip.TransportProtocolNumber netProto tcpip.NetworkProtocolNumber linkEndpoint func() stack.LinkEndpoint - localAddr tcpip.Address + localAddr tcpip.AddressWithPrefix icmpBuf func(*testing.T) buffer.View expectedConnectErr tcpip.Error checkLinkEndpoint func(t *testing.T, e stack.LinkEndpoint) @@ -101,7 +104,7 @@ func TestLocalPing(t *testing.T) { transProto: icmp.ProtocolNumber6, netProto: ipv6.ProtocolNumber, linkEndpoint: loopback.New, - localAddr: header.IPv6Loopback, + localAddr: header.IPv6Loopback.WithPrefix(), icmpBuf: ipv6ICMPBuf, checkLinkEndpoint: func(*testing.T, stack.LinkEndpoint) {}, }, @@ -110,7 +113,7 @@ func TestLocalPing(t *testing.T) { transProto: icmp.ProtocolNumber4, netProto: ipv4.ProtocolNumber, linkEndpoint: channelEP, - localAddr: utils.Ipv4Addr.Address, + localAddr: utils.Ipv4Addr, icmpBuf: ipv4ICMPBuf, checkLinkEndpoint: channelEPCheck, }, @@ -119,7 +122,7 @@ func TestLocalPing(t *testing.T) { transProto: icmp.ProtocolNumber6, netProto: ipv6.ProtocolNumber, linkEndpoint: channelEP, - localAddr: utils.Ipv6Addr.Address, + localAddr: utils.Ipv6Addr, icmpBuf: ipv6ICMPBuf, checkLinkEndpoint: channelEPCheck, }, @@ -182,9 +185,13 @@ func TestLocalPing(t *testing.T) { t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) } - if len(test.localAddr) != 0 { - if err := s.AddAddress(nicID, test.netProto, test.localAddr); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, test.netProto, test.localAddr, err) + if len(test.localAddr.Address) != 0 { + protocolAddr := tcpip.ProtocolAddress{ + Protocol: test.netProto, + AddressWithPrefix: test.localAddr, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddr, err) } } @@ -197,7 +204,7 @@ func TestLocalPing(t *testing.T) { } defer ep.Close() - connAddr := tcpip.FullAddress{Addr: test.localAddr} + connAddr := tcpip.FullAddress{Addr: test.localAddr.Address} if err := ep.Connect(connAddr); err != test.expectedConnectErr { t.Fatalf("got ep.Connect(%#v) = %s, want = %s", connAddr, err, test.expectedConnectErr) } @@ -229,8 +236,8 @@ func TestLocalPing(t *testing.T) { if diff := cmp.Diff(buffer.View(w.Bytes()[icmpDataOffset:]), payload[icmpDataOffset:]); diff != "" { t.Errorf("received data mismatch (-want +got):\n%s", diff) } - if rr.RemoteAddr.Addr != test.localAddr { - t.Errorf("got addr.Addr = %s, want = %s", rr.RemoteAddr.Addr, test.localAddr) + if rr.RemoteAddr.Addr != test.localAddr.Address { + t.Errorf("got addr.Addr = %s, want = %s", rr.RemoteAddr.Addr, test.localAddr.Address) } test.checkLinkEndpoint(t, e) @@ -302,11 +309,12 @@ func TestLocalUDP(t *testing.T) { } if subTest.addAddress { - if err := s.AddProtocolAddressWithOptions(nicID, test.canBePrimaryAddr, stack.CanBePrimaryEndpoint); err != nil { - t.Fatalf("s.AddProtocolAddressWithOptions(%d, %#v, %d): %s", nicID, test.canBePrimaryAddr, stack.FirstPrimaryEndpoint, err) + if err := s.AddProtocolAddress(nicID, test.canBePrimaryAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID, test.canBePrimaryAddr, err) } - if err := s.AddProtocolAddressWithOptions(nicID, test.firstPrimaryAddr, stack.FirstPrimaryEndpoint); err != nil { - t.Fatalf("s.AddProtocolAddressWithOptions(%d, %#v, %d): %s", nicID, test.firstPrimaryAddr, stack.FirstPrimaryEndpoint, err) + properties := stack.AddressProperties{PEB: stack.FirstPrimaryEndpoint} + if err := s.AddProtocolAddress(nicID, test.firstPrimaryAddr, properties); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, %+v): %s", nicID, test.firstPrimaryAddr, properties, err) } } diff --git a/pkg/tcpip/tests/utils/utils.go b/pkg/tcpip/tests/utils/utils.go index 2e6ae55ea..947bcc7b1 100644 --- a/pkg/tcpip/tests/utils/utils.go +++ b/pkg/tcpip/tests/utils/utils.go @@ -231,29 +231,29 @@ func SetupRoutedStacks(t *testing.T, host1Stack, routerStack, host2Stack *stack. t.Fatalf("routerStack.SetForwardingDefaultAndAllNICs(%d): %s", ipv6.ProtocolNumber, err) } - if err := host1Stack.AddProtocolAddress(Host1NICID, Host1IPv4Addr); err != nil { - t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", Host1NICID, Host1IPv4Addr, err) + if err := host1Stack.AddProtocolAddress(Host1NICID, Host1IPv4Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %+v, {}): %s", Host1NICID, Host1IPv4Addr, err) } - if err := routerStack.AddProtocolAddress(RouterNICID1, RouterNIC1IPv4Addr); err != nil { - t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", RouterNICID1, RouterNIC1IPv4Addr, err) + if err := routerStack.AddProtocolAddress(RouterNICID1, RouterNIC1IPv4Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %+v, {}): %s", RouterNICID1, RouterNIC1IPv4Addr, err) } - if err := routerStack.AddProtocolAddress(RouterNICID2, RouterNIC2IPv4Addr); err != nil { - t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", RouterNICID2, RouterNIC2IPv4Addr, err) + if err := routerStack.AddProtocolAddress(RouterNICID2, RouterNIC2IPv4Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %+v, {}): %s", RouterNICID2, RouterNIC2IPv4Addr, err) } - if err := host2Stack.AddProtocolAddress(Host2NICID, Host2IPv4Addr); err != nil { - t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", Host2NICID, Host2IPv4Addr, err) + if err := host2Stack.AddProtocolAddress(Host2NICID, Host2IPv4Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %+v, {}): %s", Host2NICID, Host2IPv4Addr, err) } - if err := host1Stack.AddProtocolAddress(Host1NICID, Host1IPv6Addr); err != nil { - t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", Host1NICID, Host1IPv6Addr, err) + if err := host1Stack.AddProtocolAddress(Host1NICID, Host1IPv6Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("host1Stack.AddProtocolAddress(%d, %+v, {}): %s", Host1NICID, Host1IPv6Addr, err) } - if err := routerStack.AddProtocolAddress(RouterNICID1, RouterNIC1IPv6Addr); err != nil { - t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", RouterNICID1, RouterNIC1IPv6Addr, err) + if err := routerStack.AddProtocolAddress(RouterNICID1, RouterNIC1IPv6Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %+v, {}): %s", RouterNICID1, RouterNIC1IPv6Addr, err) } - if err := routerStack.AddProtocolAddress(RouterNICID2, RouterNIC2IPv6Addr); err != nil { - t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", RouterNICID2, RouterNIC2IPv6Addr, err) + if err := routerStack.AddProtocolAddress(RouterNICID2, RouterNIC2IPv6Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("routerStack.AddProtocolAddress(%d, %+v, {}): %s", RouterNICID2, RouterNIC2IPv6Addr, err) } - if err := host2Stack.AddProtocolAddress(Host2NICID, Host2IPv6Addr); err != nil { - t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", Host2NICID, Host2IPv6Addr, err) + if err := host2Stack.AddProtocolAddress(Host2NICID, Host2IPv6Addr, stack.AddressProperties{}); err != nil { + t.Fatalf("host2Stack.AddProtocolAddress(%d, %+v, {}): %s", Host2NICID, Host2IPv6Addr, err) } host1Stack.SetRouteTable([]tcpip.Route{ diff --git a/pkg/tcpip/transport/BUILD b/pkg/tcpip/transport/BUILD new file mode 100644 index 000000000..af332ed91 --- /dev/null +++ b/pkg/tcpip/transport/BUILD @@ -0,0 +1,13 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "transport", + srcs = [ + "datagram.go", + "transport.go", + ], + visibility = ["//visibility:public"], + deps = ["//pkg/tcpip"], +) diff --git a/pkg/tcpip/transport/datagram.go b/pkg/tcpip/transport/datagram.go new file mode 100644 index 000000000..dfce72c69 --- /dev/null +++ b/pkg/tcpip/transport/datagram.go @@ -0,0 +1,49 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +// DatagramEndpointState is the state of a datagram-based endpoint. +type DatagramEndpointState tcpip.EndpointState + +// The states a datagram-based endpoint may be in. +const ( + _ DatagramEndpointState = iota + DatagramEndpointStateInitial + DatagramEndpointStateBound + DatagramEndpointStateConnected + DatagramEndpointStateClosed +) + +// String implements fmt.Stringer. +func (s DatagramEndpointState) String() string { + switch s { + case DatagramEndpointStateInitial: + return "INITIAL" + case DatagramEndpointStateBound: + return "BOUND" + case DatagramEndpointStateConnected: + return "CONNECTED" + case DatagramEndpointStateClosed: + return "CLOSED" + default: + panic(fmt.Sprintf("unhandled %[1]T variant = %[1]d", s)) + } +} diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index fb77febcf..1e519085d 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -213,6 +213,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. +// +checklocks:e.mu func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.state { case stateInitial: @@ -229,10 +230,8 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip } e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() + defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. @@ -330,6 +329,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp route = r } + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { return 0, &tcpip.ErrBadBuffer{} @@ -688,9 +688,20 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) tcpip.Error { return nil } +func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, addr tcpip.Address) bool { + return addr == header.IPv4Broadcast || + header.IsV4MulticastAddress(addr) || + header.IsV6MulticastAddress(addr) || + e.stack.IsSubnetBroadcast(nicID, e.NetProto, addr) +} + // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { + if len(addr.Addr) != 0 && e.isBroadcastOrMulticast(addr.NIC, addr.Addr) { + return &tcpip.ErrBadLocalAddress{} + } + e.mu.Lock() defer e.mu.Unlock() @@ -758,8 +769,6 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB switch e.NetProto { case header.IPv4ProtocolNumber: h := header.ICMPv4(pkt.TransportHeader().View()) - // TODO(gvisor.dev/issue/170): Determine if len(h) check is still needed - // after early parsing. if len(h) < header.ICMPv4MinimumSize || h.Type() != header.ICMPv4EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() @@ -767,8 +776,6 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB } case header.IPv6ProtocolNumber: h := header.ICMPv6(pkt.TransportHeader().View()) - // TODO(gvisor.dev/issue/170): Determine if len(h) check is still needed - // after early parsing. if len(h) < header.ICMPv6MinimumSize || h.Type() != header.ICMPv6EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() diff --git a/pkg/tcpip/transport/icmp/icmp_test.go b/pkg/tcpip/transport/icmp/icmp_test.go index cc950cbde..729f50e9a 100644 --- a/pkg/tcpip/transport/icmp/icmp_test.go +++ b/pkg/tcpip/transport/icmp/icmp_test.go @@ -55,8 +55,12 @@ func addNICWithDefaultRoute(t *testing.T, s *stack.Stack, id tcpip.NICID, name s t.Fatalf("s.CreateNIC(%d, _) = %s", id, err) } - if err := s.AddAddress(id, ipv4.ProtocolNumber, addrV4); err != nil { - t.Fatalf("s.AddAddress(%d, %d, %s) = %s", id, ipv4.ProtocolNumber, addrV4, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: addrV4.WithPrefix(), + } + if err := s.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", id, protocolAddr, err) } s.AddRoute(tcpip.Route{ diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index 47f7dd1cb..fa82affc1 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -123,8 +123,6 @@ func (*protocol) Wait() {} // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { - // TODO(gvisor.dev/issue/170): Implement parsing of ICMP. - // // Right now, the Parse() method is tied to enabled protocols passed into // stack.New. This works for UDP and TCP, but we handle ICMP traffic even // when netstack users don't pass ICMP as a supported protocol. diff --git a/pkg/tcpip/transport/internal/network/BUILD b/pkg/tcpip/transport/internal/network/BUILD new file mode 100644 index 000000000..b1edce39b --- /dev/null +++ b/pkg/tcpip/transport/internal/network/BUILD @@ -0,0 +1,45 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "network", + srcs = [ + "endpoint.go", + "endpoint_state.go", + ], + visibility = [ + "//pkg/tcpip/transport/raw:__pkg__", + "//pkg/tcpip/transport/udp:__pkg__", + ], + deps = [ + "//pkg/sync", + "//pkg/tcpip", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport", + ], +) + +go_test( + name = "network_test", + size = "small", + srcs = ["endpoint_test.go"], + deps = [ + ":network", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/checker", + "//pkg/tcpip/faketime", + "//pkg/tcpip/header", + "//pkg/tcpip/link/channel", + "//pkg/tcpip/link/loopback", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/testutil", + "//pkg/tcpip/transport", + "//pkg/tcpip/transport/udp", + "@com_github_google_go_cmp//cmp:go_default_library", + ], +) diff --git a/pkg/tcpip/transport/internal/network/endpoint.go b/pkg/tcpip/transport/internal/network/endpoint.go new file mode 100644 index 000000000..e3094f59f --- /dev/null +++ b/pkg/tcpip/transport/internal/network/endpoint.go @@ -0,0 +1,811 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package network provides facilities to support tcpip.Endpoints that operate +// at the network layer or above. +package network + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport" +) + +// Endpoint is a datagram-based endpoint. It only supports sending datagrams to +// a peer. +// +// +stateify savable +type Endpoint struct { + // The following fields must only be set once then never changed. + stack *stack.Stack `state:"manual"` + ops *tcpip.SocketOptions + netProto tcpip.NetworkProtocolNumber + transProto tcpip.TransportProtocolNumber + + mu sync.RWMutex `state:"nosave"` + // +checklocks:mu + wasBound bool + // owner is the owner of transmitted packets. + // + // +checklocks:mu + owner tcpip.PacketOwner + // +checklocks:mu + writeShutdown bool + // +checklocks:mu + effectiveNetProto tcpip.NetworkProtocolNumber + // +checklocks:mu + connectedRoute *stack.Route `state:"manual"` + // +checklocks:mu + multicastMemberships map[multicastMembership]struct{} + // TODO(https://gvisor.dev/issue/6389): Use different fields for IPv4/IPv6. + // +checklocks:mu + ttl uint8 + // TODO(https://gvisor.dev/issue/6389): Use different fields for IPv4/IPv6. + // +checklocks:mu + multicastTTL uint8 + // TODO(https://gvisor.dev/issue/6389): Use different fields for IPv4/IPv6. + // +checklocks:mu + multicastAddr tcpip.Address + // TODO(https://gvisor.dev/issue/6389): Use different fields for IPv4/IPv6. + // +checklocks:mu + multicastNICID tcpip.NICID + // +checklocks:mu + ipv4TOS uint8 + // +checklocks:mu + ipv6TClass uint8 + + // Lock ordering: mu > infoMu. + infoMu sync.RWMutex `state:"nosave"` + // info has a dedicated mutex so that we can avoid lock ordering violations + // when reading the endpoint's info. If we used mu, we need to guarantee + // that any lock taken while mu is held is not held when calling Info() + // which is not true as of writing (we hold mu while registering transport + // endpoints (taking the transport demuxer lock but we also hold the demuxer + // lock when delivering packets/errors to endpoints). + // + // Writes must be performed through setInfo. + // + // +checklocks:infoMu + info stack.TransportEndpointInfo + + // state holds a transport.DatagramBasedEndpointState. + // + // state must be accessed with atomics so that we can avoid lock ordering + // violations when reading the state. If we used mu, we need to guarantee + // that any lock taken while mu is held is not held when calling State() + // which is not true as of writing (we hold mu while registering transport + // endpoints (taking the transport demuxer lock but we also hold the demuxer + // lock when delivering packets/errors to endpoints). + // + // Writes must be performed through setEndpointState. + // + // +checkatomics + state uint32 +} + +// +stateify savable +type multicastMembership struct { + nicID tcpip.NICID + multicastAddr tcpip.Address +} + +// Init initializes the endpoint. +func (e *Endpoint) Init(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ops *tcpip.SocketOptions) { + e.mu.Lock() + memberships := e.multicastMemberships + e.mu.Unlock() + if memberships != nil { + panic(fmt.Sprintf("endpoint is already initialized; got e.multicastMemberships = %#v, want = nil", memberships)) + } + + switch netProto { + case header.IPv4ProtocolNumber, header.IPv6ProtocolNumber: + default: + panic(fmt.Sprintf("invalid protocol number = %d", netProto)) + } + + *e = Endpoint{ + stack: s, + ops: ops, + netProto: netProto, + transProto: transProto, + + info: stack.TransportEndpointInfo{ + NetProto: netProto, + TransProto: transProto, + }, + effectiveNetProto: netProto, + // Linux defaults to TTL=1. + multicastTTL: 1, + multicastMemberships: make(map[multicastMembership]struct{}), + } + + e.mu.Lock() + defer e.mu.Unlock() + e.setEndpointState(transport.DatagramEndpointStateInitial) +} + +// NetProto returns the network protocol the endpoint was initialized with. +func (e *Endpoint) NetProto() tcpip.NetworkProtocolNumber { + return e.netProto +} + +// setEndpointState sets the state of the endpoint. +// +// e.mu must be held to synchronize changes to state with the rest of the +// endpoint. +// +// +checklocks:e.mu +func (e *Endpoint) setEndpointState(state transport.DatagramEndpointState) { + atomic.StoreUint32(&e.state, uint32(state)) +} + +// State returns the state of the endpoint. +func (e *Endpoint) State() transport.DatagramEndpointState { + return transport.DatagramEndpointState(atomic.LoadUint32(&e.state)) +} + +// Close cleans the endpoint's resources and leaves the endpoint in a closed +// state. +func (e *Endpoint) Close() { + e.mu.Lock() + defer e.mu.Unlock() + + if e.State() == transport.DatagramEndpointStateClosed { + return + } + + for mem := range e.multicastMemberships { + e.stack.LeaveGroup(e.netProto, mem.nicID, mem.multicastAddr) + } + e.multicastMemberships = nil + + if e.connectedRoute != nil { + e.connectedRoute.Release() + e.connectedRoute = nil + } + + e.setEndpointState(transport.DatagramEndpointStateClosed) +} + +// SetOwner sets the owner of transmitted packets. +func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) { + e.mu.Lock() + defer e.mu.Unlock() + e.owner = owner +} + +func calculateTTL(route *stack.Route, ttl uint8, multicastTTL uint8) uint8 { + if header.IsV4MulticastAddress(route.RemoteAddress()) || header.IsV6MulticastAddress(route.RemoteAddress()) { + return multicastTTL + } + + if ttl == 0 { + return route.DefaultTTL() + } + + return ttl +} + +// WriteContext holds the context for a write. +type WriteContext struct { + transProto tcpip.TransportProtocolNumber + route *stack.Route + ttl uint8 + tos uint8 + owner tcpip.PacketOwner +} + +// Release releases held resources. +func (c *WriteContext) Release() { + c.route.Release() + *c = WriteContext{} +} + +// WritePacketInfo is the properties of a packet that may be written. +type WritePacketInfo struct { + NetProto tcpip.NetworkProtocolNumber + LocalAddress, RemoteAddress tcpip.Address + MaxHeaderLength uint16 + RequiresTXTransportChecksum bool +} + +// PacketInfo returns the properties of a packet that will be written. +func (c *WriteContext) PacketInfo() WritePacketInfo { + return WritePacketInfo{ + NetProto: c.route.NetProto(), + LocalAddress: c.route.LocalAddress(), + RemoteAddress: c.route.RemoteAddress(), + MaxHeaderLength: c.route.MaxHeaderLength(), + RequiresTXTransportChecksum: c.route.RequiresTXTransportChecksum(), + } +} + +// WritePacket attempts to write the packet. +func (c *WriteContext) WritePacket(pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error { + pkt.Owner = c.owner + + if headerIncluded { + return c.route.WriteHeaderIncludedPacket(pkt) + } + + return c.route.WritePacket(stack.NetworkHeaderParams{ + Protocol: c.transProto, + TTL: c.ttl, + TOS: c.tos, + }, pkt) +} + +// AcquireContextForWrite acquires a WriteContext. +func (e *Endpoint) AcquireContextForWrite(opts tcpip.WriteOptions) (WriteContext, tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + // MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op. + if opts.More { + return WriteContext{}, &tcpip.ErrInvalidOptionValue{} + } + + if e.State() == transport.DatagramEndpointStateClosed { + return WriteContext{}, &tcpip.ErrInvalidEndpointState{} + } + + if e.writeShutdown { + return WriteContext{}, &tcpip.ErrClosedForSend{} + } + + route := e.connectedRoute + if opts.To == nil { + // If the user doesn't specify a destination, they should have + // connected to another address. + if e.State() != transport.DatagramEndpointStateConnected { + return WriteContext{}, &tcpip.ErrDestinationRequired{} + } + + route.Acquire() + } else { + // Reject destination address if it goes through a different + // NIC than the endpoint was bound to. + nicID := opts.To.NIC + if nicID == 0 { + nicID = tcpip.NICID(e.ops.GetBindToDevice()) + } + info := e.Info() + if info.BindNICID != 0 { + if nicID != 0 && nicID != info.BindNICID { + return WriteContext{}, &tcpip.ErrNoRoute{} + } + + nicID = info.BindNICID + } + if nicID == 0 { + nicID = info.RegisterNICID + } + + dst, netProto, err := e.checkV4Mapped(*opts.To) + if err != nil { + return WriteContext{}, err + } + + route, _, err = e.connectRouteRLocked(nicID, dst, netProto) + if err != nil { + return WriteContext{}, err + } + } + + if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { + route.Release() + return WriteContext{}, &tcpip.ErrBroadcastDisabled{} + } + + var tos uint8 + switch netProto := route.NetProto(); netProto { + case header.IPv4ProtocolNumber: + tos = e.ipv4TOS + case header.IPv6ProtocolNumber: + tos = e.ipv6TClass + default: + panic(fmt.Sprintf("invalid protocol number = %d", netProto)) + } + + return WriteContext{ + transProto: e.transProto, + route: route, + ttl: calculateTTL(route, e.ttl, e.multicastTTL), + tos: tos, + owner: e.owner, + }, nil +} + +// Disconnect disconnects the endpoint from its peer. +func (e *Endpoint) Disconnect() { + e.mu.Lock() + defer e.mu.Unlock() + + if e.State() != transport.DatagramEndpointStateConnected { + return + } + + info := e.Info() + // Exclude ephemerally bound endpoints. + if e.wasBound { + info.ID = stack.TransportEndpointID{ + LocalAddress: info.BindAddr, + } + e.setEndpointState(transport.DatagramEndpointStateBound) + } else { + info.ID = stack.TransportEndpointID{} + e.setEndpointState(transport.DatagramEndpointStateInitial) + } + e.setInfo(info) + + e.connectedRoute.Release() + e.connectedRoute = nil +} + +// connectRouteRLocked establishes a route to the specified interface or the +// configured multicast interface if no interface is specified and the +// specified address is a multicast address. +// +// TODO(https://gvisor.dev/issue/6590): Annotate read lock requirement. +// +checklocks:e.mu +func (e *Endpoint) connectRouteRLocked(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, tcpip.Error) { + localAddr := e.Info().ID.LocalAddress + if e.isBroadcastOrMulticast(nicID, netProto, localAddr) { + // A packet can only originate from a unicast address (i.e., an interface). + localAddr = "" + } + + if header.IsV4MulticastAddress(addr.Addr) || header.IsV6MulticastAddress(addr.Addr) { + if nicID == 0 { + nicID = e.multicastNICID + } + if localAddr == "" && nicID == 0 { + localAddr = e.multicastAddr + } + } + + // Find a route to the desired destination. + r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop()) + if err != nil { + return nil, 0, err + } + return r, nicID, nil +} + +// Connect connects the endpoint to the address. +func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { + return e.ConnectAndThen(addr, func(_ tcpip.NetworkProtocolNumber, _, _ stack.TransportEndpointID) tcpip.Error { + return nil + }) +} + +// ConnectAndThen connects the endpoint to the address and then calls the +// provided function. +// +// If the function returns an error, the endpoint's state does not change. The +// function will be called with the network protocol used to connect to the peer +// and the source and destination addresses that will be used to send traffic to +// the peer. +func (e *Endpoint) ConnectAndThen(addr tcpip.FullAddress, f func(netProto tcpip.NetworkProtocolNumber, previousID, nextID stack.TransportEndpointID) tcpip.Error) tcpip.Error { + addr.Port = 0 + + e.mu.Lock() + defer e.mu.Unlock() + + info := e.Info() + nicID := addr.NIC + switch e.State() { + case transport.DatagramEndpointStateInitial: + case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: + if info.BindNICID == 0 { + break + } + + if nicID != 0 && nicID != info.BindNICID { + return &tcpip.ErrInvalidEndpointState{} + } + + nicID = info.BindNICID + default: + return &tcpip.ErrInvalidEndpointState{} + } + + addr, netProto, err := e.checkV4Mapped(addr) + if err != nil { + return err + } + + r, nicID, err := e.connectRouteRLocked(nicID, addr, netProto) + if err != nil { + return err + } + + id := stack.TransportEndpointID{ + LocalAddress: info.ID.LocalAddress, + RemoteAddress: r.RemoteAddress(), + } + if e.State() == transport.DatagramEndpointStateInitial { + id.LocalAddress = r.LocalAddress() + } + + if err := f(r.NetProto(), info.ID, id); err != nil { + return err + } + + if e.connectedRoute != nil { + // If the endpoint was previously connected then release any previous route. + e.connectedRoute.Release() + } + e.connectedRoute = r + info.ID = id + info.RegisterNICID = nicID + e.setInfo(info) + e.effectiveNetProto = netProto + e.setEndpointState(transport.DatagramEndpointStateConnected) + return nil +} + +// Shutdown shutsdown the endpoint. +func (e *Endpoint) Shutdown() tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + switch state := e.State(); state { + case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: + return &tcpip.ErrNotConnected{} + case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: + e.writeShutdown = true + return nil + default: + panic(fmt.Sprintf("unhandled state = %s", state)) + } +} + +// checkV4MappedRLocked determines the effective network protocol and converts +// addr to its canonical form. +func (e *Endpoint) checkV4Mapped(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { + info := e.Info() + unwrapped, netProto, err := info.AddrNetProtoLocked(addr, e.ops.GetV6Only()) + if err != nil { + return tcpip.FullAddress{}, 0, err + } + return unwrapped, netProto, nil +} + +func (e *Endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { + return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr) +} + +// Bind binds the endpoint to the address. +func (e *Endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { + return e.BindAndThen(addr, func(tcpip.NetworkProtocolNumber, tcpip.Address) tcpip.Error { + return nil + }) +} + +// BindAndThen binds the endpoint to the address and then calls the provided +// function. +// +// If the function returns an error, the endpoint's state does not change. The +// function will be called with the bound network protocol and address. +func (e *Endpoint) BindAndThen(addr tcpip.FullAddress, f func(tcpip.NetworkProtocolNumber, tcpip.Address) tcpip.Error) tcpip.Error { + addr.Port = 0 + + e.mu.Lock() + defer e.mu.Unlock() + + // Don't allow binding once endpoint is not in the initial state + // anymore. + if e.State() != transport.DatagramEndpointStateInitial { + return &tcpip.ErrInvalidEndpointState{} + } + + addr, netProto, err := e.checkV4Mapped(addr) + if err != nil { + return err + } + + nicID := addr.NIC + if len(addr.Addr) != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) { + nicID = e.stack.CheckLocalAddress(nicID, netProto, addr.Addr) + if nicID == 0 { + return &tcpip.ErrBadLocalAddress{} + } + } + + if err := f(netProto, addr.Addr); err != nil { + return err + } + + e.wasBound = true + + info := e.Info() + info.ID = stack.TransportEndpointID{ + LocalAddress: addr.Addr, + } + info.BindNICID = addr.NIC + info.RegisterNICID = nicID + info.BindAddr = addr.Addr + e.setInfo(info) + e.effectiveNetProto = netProto + e.setEndpointState(transport.DatagramEndpointStateBound) + return nil +} + +// WasBound returns true iff the endpoint was ever bound. +func (e *Endpoint) WasBound() bool { + e.mu.RLock() + defer e.mu.RUnlock() + return e.wasBound +} + +// GetLocalAddress returns the address that the endpoint is bound to. +func (e *Endpoint) GetLocalAddress() tcpip.FullAddress { + e.mu.RLock() + defer e.mu.RUnlock() + + info := e.Info() + addr := info.BindAddr + if e.State() == transport.DatagramEndpointStateConnected { + addr = e.connectedRoute.LocalAddress() + } + + return tcpip.FullAddress{ + NIC: info.RegisterNICID, + Addr: addr, + } +} + +// GetRemoteAddress returns the address that the endpoint is connected to. +func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, bool) { + e.mu.RLock() + defer e.mu.RUnlock() + + if e.State() != transport.DatagramEndpointStateConnected { + return tcpip.FullAddress{}, false + } + + return tcpip.FullAddress{ + Addr: e.connectedRoute.RemoteAddress(), + NIC: e.Info().RegisterNICID, + }, true +} + +// SetSockOptInt sets the socket option. +func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { + switch opt { + case tcpip.MTUDiscoverOption: + // Return not supported if the value is not disabling path + // MTU discovery. + if v != tcpip.PMTUDiscoveryDont { + return &tcpip.ErrNotSupported{} + } + + case tcpip.MulticastTTLOption: + e.mu.Lock() + e.multicastTTL = uint8(v) + e.mu.Unlock() + + case tcpip.TTLOption: + e.mu.Lock() + e.ttl = uint8(v) + e.mu.Unlock() + + case tcpip.IPv4TOSOption: + e.mu.Lock() + e.ipv4TOS = uint8(v) + e.mu.Unlock() + + case tcpip.IPv6TrafficClassOption: + e.mu.Lock() + e.ipv6TClass = uint8(v) + e.mu.Unlock() + } + + return nil +} + +// GetSockOptInt returns the socket option. +func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { + switch opt { + case tcpip.MTUDiscoverOption: + // The only supported setting is path MTU discovery disabled. + return tcpip.PMTUDiscoveryDont, nil + + case tcpip.MulticastTTLOption: + e.mu.Lock() + v := int(e.multicastTTL) + e.mu.Unlock() + return v, nil + + case tcpip.TTLOption: + e.mu.Lock() + v := int(e.ttl) + e.mu.Unlock() + return v, nil + + case tcpip.IPv4TOSOption: + e.mu.RLock() + v := int(e.ipv4TOS) + e.mu.RUnlock() + return v, nil + + case tcpip.IPv6TrafficClassOption: + e.mu.RLock() + v := int(e.ipv6TClass) + e.mu.RUnlock() + return v, nil + + default: + return -1, &tcpip.ErrUnknownProtocolOption{} + } +} + +// SetSockOpt sets the socket option. +func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { + switch v := opt.(type) { + case *tcpip.MulticastInterfaceOption: + e.mu.Lock() + defer e.mu.Unlock() + + fa := tcpip.FullAddress{Addr: v.InterfaceAddr} + fa, netProto, err := e.checkV4Mapped(fa) + if err != nil { + return err + } + nic := v.NIC + addr := fa.Addr + + if nic == 0 && addr == "" { + e.multicastAddr = "" + e.multicastNICID = 0 + break + } + + if nic != 0 { + if !e.stack.CheckNIC(nic) { + return &tcpip.ErrBadLocalAddress{} + } + } else { + nic = e.stack.CheckLocalAddress(0, netProto, addr) + if nic == 0 { + return &tcpip.ErrBadLocalAddress{} + } + } + + if info := e.Info(); info.BindNICID != 0 && info.BindNICID != nic { + return &tcpip.ErrInvalidEndpointState{} + } + + e.multicastNICID = nic + e.multicastAddr = addr + + case *tcpip.AddMembershipOption: + if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { + return &tcpip.ErrInvalidOptionValue{} + } + + nicID := v.NIC + + if v.InterfaceAddr.Unspecified() { + if nicID == 0 { + if r, err := e.stack.FindRoute(0, "", v.MulticastAddr, e.netProto, false /* multicastLoop */); err == nil { + nicID = r.NICID() + r.Release() + } + } + } else { + nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) + } + if nicID == 0 { + return &tcpip.ErrUnknownDevice{} + } + + memToInsert := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} + + e.mu.Lock() + defer e.mu.Unlock() + + if _, ok := e.multicastMemberships[memToInsert]; ok { + return &tcpip.ErrPortInUse{} + } + + if err := e.stack.JoinGroup(e.netProto, nicID, v.MulticastAddr); err != nil { + return err + } + + e.multicastMemberships[memToInsert] = struct{}{} + + case *tcpip.RemoveMembershipOption: + if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { + return &tcpip.ErrInvalidOptionValue{} + } + + nicID := v.NIC + if v.InterfaceAddr.Unspecified() { + if nicID == 0 { + if r, err := e.stack.FindRoute(0, "", v.MulticastAddr, e.netProto, false /* multicastLoop */); err == nil { + nicID = r.NICID() + r.Release() + } + } + } else { + nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) + } + if nicID == 0 { + return &tcpip.ErrUnknownDevice{} + } + + memToRemove := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} + + e.mu.Lock() + defer e.mu.Unlock() + + if _, ok := e.multicastMemberships[memToRemove]; !ok { + return &tcpip.ErrBadLocalAddress{} + } + + if err := e.stack.LeaveGroup(e.netProto, nicID, v.MulticastAddr); err != nil { + return err + } + + delete(e.multicastMemberships, memToRemove) + + case *tcpip.SocketDetachFilterOption: + return nil + } + return nil +} + +// GetSockOpt returns the socket option. +func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { + switch o := opt.(type) { + case *tcpip.MulticastInterfaceOption: + e.mu.Lock() + *o = tcpip.MulticastInterfaceOption{ + NIC: e.multicastNICID, + InterfaceAddr: e.multicastAddr, + } + e.mu.Unlock() + + default: + return &tcpip.ErrUnknownProtocolOption{} + } + return nil +} + +// Info returns a copy of the endpoint info. +func (e *Endpoint) Info() stack.TransportEndpointInfo { + e.infoMu.RLock() + defer e.infoMu.RUnlock() + return e.info +} + +// setInfo sets the endpoint's info. +// +// e.mu must be held to synchronize changes to info with the rest of the +// endpoint. +// +// +checklocks:e.mu +func (e *Endpoint) setInfo(info stack.TransportEndpointInfo) { + e.infoMu.Lock() + defer e.infoMu.Unlock() + e.info = info +} diff --git a/pkg/tcpip/transport/internal/network/endpoint_state.go b/pkg/tcpip/transport/internal/network/endpoint_state.go new file mode 100644 index 000000000..68bd1fbf6 --- /dev/null +++ b/pkg/tcpip/transport/internal/network/endpoint_state.go @@ -0,0 +1,58 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package network + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport" +) + +// Resume implements tcpip.ResumableEndpoint.Resume. +func (e *Endpoint) Resume(s *stack.Stack) { + e.mu.Lock() + defer e.mu.Unlock() + + e.stack = s + + for m := range e.multicastMemberships { + if err := e.stack.JoinGroup(e.netProto, m.nicID, m.multicastAddr); err != nil { + panic(fmt.Sprintf("e.stack.JoinGroup(%d, %d, %s): %s", e.netProto, m.nicID, m.multicastAddr, err)) + } + } + + info := e.Info() + + switch state := e.State(); state { + case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: + case transport.DatagramEndpointStateBound: + if len(info.ID.LocalAddress) != 0 && !e.isBroadcastOrMulticast(info.RegisterNICID, e.effectiveNetProto, info.ID.LocalAddress) { + if e.stack.CheckLocalAddress(info.RegisterNICID, e.effectiveNetProto, info.ID.LocalAddress) == 0 { + panic(fmt.Sprintf("got e.stack.CheckLocalAddress(%d, %d, %s) = 0, want != 0", info.RegisterNICID, e.effectiveNetProto, info.ID.LocalAddress)) + } + } + case transport.DatagramEndpointStateConnected: + var err tcpip.Error + multicastLoop := e.ops.GetMulticastLoop() + e.connectedRoute, err = e.stack.FindRoute(info.RegisterNICID, info.ID.LocalAddress, info.ID.RemoteAddress, e.effectiveNetProto, multicastLoop) + if err != nil { + panic(fmt.Sprintf("e.stack.FindRoute(%d, %s, %s, %d, %t): %s", info.RegisterNICID, info.ID.LocalAddress, info.ID.RemoteAddress, e.effectiveNetProto, multicastLoop, err)) + } + default: + panic(fmt.Sprintf("unhandled state = %s", state)) + } +} diff --git a/pkg/tcpip/transport/internal/network/endpoint_test.go b/pkg/tcpip/transport/internal/network/endpoint_test.go new file mode 100644 index 000000000..f263a9ea2 --- /dev/null +++ b/pkg/tcpip/transport/internal/network/endpoint_test.go @@ -0,0 +1,318 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package network_test + +import ( + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/checker" + "gvisor.dev/gvisor/pkg/tcpip/faketime" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/testutil" + "gvisor.dev/gvisor/pkg/tcpip/transport" + "gvisor.dev/gvisor/pkg/tcpip/transport/internal/network" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" +) + +var ( + ipv4NICAddr = testutil.MustParse4("1.2.3.4") + ipv6NICAddr = testutil.MustParse6("a::1") + ipv4RemoteAddr = testutil.MustParse4("6.7.8.9") + ipv6RemoteAddr = testutil.MustParse6("b::1") +) + +func TestEndpointStateTransitions(t *testing.T) { + const nicID = 1 + + data := buffer.View([]byte{1, 2, 4, 5}) + v4Checker := func(t *testing.T, b buffer.View) { + checker.IPv4(t, b, + checker.SrcAddr(ipv4NICAddr), + checker.DstAddr(ipv4RemoteAddr), + checker.IPPayload(data), + ) + } + + v6Checker := func(t *testing.T, b buffer.View) { + checker.IPv6(t, b, + checker.SrcAddr(ipv6NICAddr), + checker.DstAddr(ipv6RemoteAddr), + checker.IPPayload(data), + ) + } + + tests := []struct { + name string + netProto tcpip.NetworkProtocolNumber + expectedMaxHeaderLength uint16 + expectedNetProto tcpip.NetworkProtocolNumber + expectedLocalAddr tcpip.Address + bindAddr tcpip.Address + expectedBoundAddr tcpip.Address + remoteAddr tcpip.Address + expectedRemoteAddr tcpip.Address + checker func(*testing.T, buffer.View) + }{ + { + name: "IPv4", + netProto: ipv4.ProtocolNumber, + expectedMaxHeaderLength: header.IPv4MaximumHeaderSize, + expectedNetProto: ipv4.ProtocolNumber, + expectedLocalAddr: ipv4NICAddr, + bindAddr: header.IPv4AllSystems, + expectedBoundAddr: header.IPv4AllSystems, + remoteAddr: ipv4RemoteAddr, + expectedRemoteAddr: ipv4RemoteAddr, + checker: v4Checker, + }, + { + name: "IPv6", + netProto: ipv6.ProtocolNumber, + expectedMaxHeaderLength: header.IPv6FixedHeaderSize, + expectedNetProto: ipv6.ProtocolNumber, + expectedLocalAddr: ipv6NICAddr, + bindAddr: header.IPv6AllNodesMulticastAddress, + expectedBoundAddr: header.IPv6AllNodesMulticastAddress, + remoteAddr: ipv6RemoteAddr, + expectedRemoteAddr: ipv6RemoteAddr, + checker: v6Checker, + }, + { + name: "IPv4-mapped-IPv6", + netProto: ipv6.ProtocolNumber, + expectedMaxHeaderLength: header.IPv4MaximumHeaderSize, + expectedNetProto: ipv4.ProtocolNumber, + expectedLocalAddr: ipv4NICAddr, + bindAddr: testutil.MustParse6("::ffff:e000:0001"), + expectedBoundAddr: header.IPv4AllSystems, + remoteAddr: testutil.MustParse6("::ffff:0607:0809"), + expectedRemoteAddr: ipv4RemoteAddr, + checker: v4Checker, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + Clock: &faketime.NullClock{}, + }) + e := channel.New(1, header.IPv6MinimumMTU, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) + } + + ipv4ProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ipv4NICAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, ipv4ProtocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}: %s", nicID, ipv4ProtocolAddr, err) + } + ipv6ProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: ipv6NICAddr.WithPrefix(), + } + + if err := s.AddProtocolAddress(nicID, ipv6ProtocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID, ipv6ProtocolAddr, err) + } + + s.SetRouteTable([]tcpip.Route{ + {Destination: ipv4RemoteAddr.WithPrefix().Subnet(), NIC: nicID}, + {Destination: ipv6RemoteAddr.WithPrefix().Subnet(), NIC: nicID}, + }) + + var ops tcpip.SocketOptions + var ep network.Endpoint + ep.Init(s, test.netProto, udp.ProtocolNumber, &ops) + defer ep.Close() + if state := ep.State(); state != transport.DatagramEndpointStateInitial { + t.Fatalf("got ep.State() = %s, want = %s", state, transport.DatagramEndpointStateInitial) + } + + bindAddr := tcpip.FullAddress{Addr: test.bindAddr} + if err := ep.Bind(bindAddr); err != nil { + t.Fatalf("ep.Bind(%#v): %s", bindAddr, err) + } + if state := ep.State(); state != transport.DatagramEndpointStateBound { + t.Fatalf("got ep.State() = %s, want = %s", state, transport.DatagramEndpointStateBound) + } + if diff := cmp.Diff(ep.GetLocalAddress(), tcpip.FullAddress{Addr: test.expectedBoundAddr}); diff != "" { + t.Errorf("ep.GetLocalAddress() mismatch (-want +got):\n%s", diff) + } + if addr, connected := ep.GetRemoteAddress(); connected { + t.Errorf("got ep.GetRemoteAddress() = (true, %#v), want = (false, _)", addr) + } + + connectAddr := tcpip.FullAddress{Addr: test.remoteAddr} + if err := ep.Connect(connectAddr); err != nil { + t.Fatalf("ep.Connect(%#v): %s", connectAddr, err) + } + if state := ep.State(); state != transport.DatagramEndpointStateConnected { + t.Fatalf("got ep.State() = %s, want = %s", state, transport.DatagramEndpointStateConnected) + } + if diff := cmp.Diff(ep.GetLocalAddress(), tcpip.FullAddress{Addr: test.expectedLocalAddr}); diff != "" { + t.Errorf("ep.GetLocalAddress() mismatch (-want +got):\n%s", diff) + } + if addr, connected := ep.GetRemoteAddress(); !connected { + t.Errorf("got ep.GetRemoteAddress() = (false, _), want = (true, %#v)", connectAddr) + } else if diff := cmp.Diff(addr, tcpip.FullAddress{Addr: test.expectedRemoteAddr}); diff != "" { + t.Errorf("remote address mismatch (-want +got):\n%s", diff) + } + + ctx, err := ep.AcquireContextForWrite(tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("ep.AcquireContexForWrite({}): %s", err) + } + defer ctx.Release() + info := ctx.PacketInfo() + if diff := cmp.Diff(network.WritePacketInfo{ + NetProto: test.expectedNetProto, + LocalAddress: test.expectedLocalAddr, + RemoteAddress: test.expectedRemoteAddr, + MaxHeaderLength: test.expectedMaxHeaderLength, + RequiresTXTransportChecksum: true, + }, info); diff != "" { + t.Errorf("write packet info mismatch (-want +got):\n%s", diff) + } + if err := ctx.WritePacket(stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(info.MaxHeaderLength), + Data: data.ToVectorisedView(), + }), false /* headerIncluded */); err != nil { + t.Fatalf("ctx.WritePacket(_, false): %s", err) + } + if pkt, ok := e.Read(); !ok { + t.Fatalf("expected packet to be read from link endpoint") + } else { + test.checker(t, stack.PayloadSince(pkt.Pkt.NetworkHeader())) + } + + ep.Close() + if state := ep.State(); state != transport.DatagramEndpointStateClosed { + t.Fatalf("got ep.State() = %s, want = %s", state, transport.DatagramEndpointStateClosed) + } + }) + } +} + +func TestBindNICID(t *testing.T) { + const nicID = 1 + + tests := []struct { + name string + netProto tcpip.NetworkProtocolNumber + bindAddr tcpip.Address + unicast bool + }{ + { + name: "IPv4 multicast", + netProto: ipv4.ProtocolNumber, + bindAddr: header.IPv4AllSystems, + unicast: false, + }, + { + name: "IPv6 multicast", + netProto: ipv6.ProtocolNumber, + bindAddr: header.IPv6AllNodesMulticastAddress, + unicast: false, + }, + { + name: "IPv4 unicast", + netProto: ipv4.ProtocolNumber, + bindAddr: ipv4NICAddr, + unicast: true, + }, + { + name: "IPv6 unicast", + netProto: ipv6.ProtocolNumber, + bindAddr: ipv6NICAddr, + unicast: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for _, testBindNICID := range []tcpip.NICID{0, nicID} { + t.Run(fmt.Sprintf("BindNICID=%d", testBindNICID), func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + Clock: &faketime.NullClock{}, + }) + if err := s.CreateNIC(nicID, loopback.New()); err != nil { + t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err) + } + + ipv4ProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: ipv4NICAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, ipv4ProtocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID, ipv4ProtocolAddr, err) + } + ipv6ProtocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: ipv6NICAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, ipv6ProtocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("s.AddProtocolAddress(%d, %+v, {}): %s", nicID, ipv6ProtocolAddr, err) + } + + var ops tcpip.SocketOptions + var ep network.Endpoint + ep.Init(s, test.netProto, udp.ProtocolNumber, &ops) + defer ep.Close() + if ep.WasBound() { + t.Fatal("got ep.WasBound() = true, want = false") + } + wantInfo := stack.TransportEndpointInfo{NetProto: test.netProto, TransProto: udp.ProtocolNumber} + if diff := cmp.Diff(wantInfo, ep.Info()); diff != "" { + t.Fatalf("ep.Info() mismatch (-want +got):\n%s", diff) + } + + bindAddr := tcpip.FullAddress{Addr: test.bindAddr, NIC: testBindNICID} + if err := ep.Bind(bindAddr); err != nil { + t.Fatalf("ep.Bind(%#v): %s", bindAddr, err) + } + if !ep.WasBound() { + t.Error("got ep.WasBound() = false, want = true") + } + wantInfo.ID = stack.TransportEndpointID{LocalAddress: bindAddr.Addr} + wantInfo.BindAddr = bindAddr.Addr + wantInfo.BindNICID = bindAddr.NIC + if test.unicast { + wantInfo.RegisterNICID = nicID + } else { + wantInfo.RegisterNICID = bindAddr.NIC + } + if diff := cmp.Diff(wantInfo, ep.Info()); diff != "" { + t.Errorf("ep.Info() mismatch (-want +got):\n%s", diff) + } + }) + } + }) + } +} diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index cd8c99d41..1f30e5adb 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -25,7 +25,6 @@ package packet import ( - "fmt" "io" "time" @@ -60,49 +59,46 @@ type packet struct { // // +stateify savable type endpoint struct { - stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` - netProto tcpip.NetworkProtocolNumber waiterQueue *waiter.Queue cooked bool - - // The following fields are used to manage the receive queue and are - // protected by rcvMu. - rcvMu sync.Mutex `state:"nosave"` - rcvList packetList + ops tcpip.SocketOptions + stats tcpip.TransportEndpointStats `state:"nosave"` + + // The following fields are used to manage the receive queue. + rcvMu sync.Mutex `state:"nosave"` + // +checklocks:rcvMu + rcvList packetList + // +checklocks:rcvMu rcvBufSize int - rcvClosed bool - - // The following fields are protected by mu. - mu sync.RWMutex `state:"nosave"` - closed bool - stats tcpip.TransportEndpointStats `state:"nosave"` - bound bool + // +checklocks:rcvMu + rcvClosed bool + // +checklocks:rcvMu + rcvDisabled bool + + mu sync.RWMutex `state:"nosave"` + // +checklocks:mu + netProto tcpip.NetworkProtocolNumber + // +checklocks:mu + closed bool + // +checklocks:mu + bound bool + // +checklocks:mu boundNIC tcpip.NICID - // lastErrorMu protects lastError. lastErrorMu sync.Mutex `state:"nosave"` - lastError tcpip.Error - - // ops is used to get socket level options. - ops tcpip.SocketOptions - - // frozen indicates if the packets should be delivered to the endpoint - // during restore. - frozen bool + // +checklocks:lastErrorMu + lastError tcpip.Error } // NewEndpoint returns a new packet endpoint. func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { ep := &endpoint{ - stack: s, - TransportEndpointInfo: stack.TransportEndpointInfo{ - NetProto: netProto, - }, + stack: s, cooked: cooked, netProto: netProto, waiterQueue: waiterQueue, @@ -207,9 +203,52 @@ func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResul return res, nil } -func (*endpoint) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, tcpip.Error) { - // TODO(gvisor.dev/issue/173): Implement. - return 0, &tcpip.ErrInvalidOptionValue{} +func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + if !ep.stack.PacketEndpointWriteSupported() { + return 0, &tcpip.ErrNotSupported{} + } + + ep.mu.Lock() + closed := ep.closed + nicID := ep.boundNIC + proto := ep.netProto + ep.mu.Unlock() + if closed { + return 0, &tcpip.ErrClosedForSend{} + } + + var remote tcpip.LinkAddress + if to := opts.To; to != nil { + remote = tcpip.LinkAddress(to.Addr) + + if n := to.NIC; n != 0 { + nicID = n + } + + if p := to.Port; p != 0 { + proto = tcpip.NetworkProtocolNumber(p) + } + } + + if nicID == 0 { + return 0, &tcpip.ErrInvalidOptionValue{} + } + + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. + payloadBytes := make(buffer.View, p.Len()) + if _, err := io.ReadFull(p, payloadBytes); err != nil { + return 0, &tcpip.ErrBadBuffer{} + } + + if err := func() tcpip.Error { + if ep.cooked { + return ep.stack.WritePacketToRemote(nicID, remote, proto, payloadBytes.ToVectorisedView()) + } + return ep.stack.WriteRawPacket(nicID, proto, payloadBytes.ToVectorisedView()) + }(); err != nil { + return 0, err + } + return int64(len(payloadBytes)), nil } // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be @@ -244,8 +283,6 @@ func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpi // Bind implements tcpip.Endpoint.Bind. func (ep *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { - // TODO(gvisor.dev/issue/173): Add Bind support. - // "By default, all packets of the specified protocol type are passed // to a packet socket. To get packets only from a specific interface // use bind(2) specifying an address in a struct sockaddr_ll to bind @@ -256,7 +293,8 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() - if ep.bound && ep.boundNIC == addr.NIC { + netProto := tcpip.NetworkProtocolNumber(addr.Port) + if ep.bound && ep.boundNIC == addr.NIC && ep.netProto == netProto { // If the NIC being bound is the same then just return success. return nil } @@ -266,12 +304,13 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { ep.bound = false // Bind endpoint to receive packets from specific interface. - if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil { + if err := ep.stack.RegisterPacketEndpoint(addr.NIC, netProto, ep); err != nil { return err } ep.bound = true ep.boundNIC = addr.NIC + ep.netProto = netProto return nil } @@ -374,7 +413,7 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, } rcvBufSize := ep.ops.GetReceiveBufferSize() - if ep.frozen || ep.rcvBufSize >= int(rcvBufSize) { + if ep.rcvDisabled || ep.rcvBufSize >= int(rcvBufSize) { ep.rcvMu.Unlock() ep.stack.Stats().DroppedPackets.Increment() ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() @@ -383,78 +422,39 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, wasEmpty := ep.rcvBufSize == 0 - // Push new packet into receive list and increment the buffer size. - var packet packet - // TODO(gvisor.dev/issue/173): Return network protocol. + rcvdPkt := packet{ + packetInfo: tcpip.LinkPacketInfo{ + Protocol: netProto, + PktType: pkt.PktType, + }, + senderAddr: tcpip.FullAddress{ + NIC: nicID, + }, + receivedAt: ep.stack.Clock().Now(), + } + if !pkt.LinkHeader().View().IsEmpty() { - // Get info directly from the ethernet header. hdr := header.Ethernet(pkt.LinkHeader().View()) - packet.senderAddr = tcpip.FullAddress{ - NIC: nicID, - Addr: tcpip.Address(hdr.SourceAddress()), - } - packet.packetInfo.Protocol = netProto - packet.packetInfo.PktType = pkt.PktType - } else { - // Guess the would-be ethernet header. - packet.senderAddr = tcpip.FullAddress{ - NIC: nicID, - Addr: tcpip.Address(localAddr), - } - packet.packetInfo.Protocol = netProto - packet.packetInfo.PktType = pkt.PktType + rcvdPkt.senderAddr.Addr = tcpip.Address(hdr.SourceAddress()) } if ep.cooked { - // Cooked packets can simply be queued. - switch pkt.PktType { - case tcpip.PacketHost: - packet.data = pkt.Data().ExtractVV() - case tcpip.PacketOutgoing: - // Strip Link Header. - var combinedVV buffer.VectorisedView - if v := pkt.NetworkHeader().View(); !v.IsEmpty() { - combinedVV.AppendView(v) - } - if v := pkt.TransportHeader().View(); !v.IsEmpty() { - combinedVV.AppendView(v) - } - combinedVV.Append(pkt.Data().ExtractVV()) - packet.data = combinedVV - default: - panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt)) + // Cooked packet endpoints don't include the link-headers in received + // packets. + if v := pkt.NetworkHeader().View(); !v.IsEmpty() { + rcvdPkt.data.AppendView(v) } - - } else { - // Raw packets need their ethernet headers prepended before - // queueing. - var linkHeader buffer.View - if pkt.PktType != tcpip.PacketOutgoing { - if pkt.LinkHeader().View().IsEmpty() { - // We weren't provided with an actual ethernet header, - // so fake one. - ethFields := header.EthernetFields{ - SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), - DstAddr: localAddr, - Type: netProto, - } - fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) - fakeHeader.Encode(ðFields) - linkHeader = buffer.View(fakeHeader) - } else { - linkHeader = append(buffer.View(nil), pkt.LinkHeader().View()...) - } - combinedVV := linkHeader.ToVectorisedView() - combinedVV.Append(pkt.Data().ExtractVV()) - packet.data = combinedVV - } else { - packet.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) + if v := pkt.TransportHeader().View(); !v.IsEmpty() { + rcvdPkt.data.AppendView(v) } + rcvdPkt.data.Append(pkt.Data().ExtractVV()) + } else { + // Raw packet endpoints include link-headers in received packets. + rcvdPkt.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) } - packet.receivedAt = ep.stack.Clock().Now() - ep.rcvList.PushBack(&packet) - ep.rcvBufSize += packet.data.Size() + ep.rcvList.PushBack(&rcvdPkt) + ep.rcvBufSize += rcvdPkt.data.Size() ep.rcvMu.Unlock() ep.stats.PacketsReceived.Increment() @@ -472,10 +472,8 @@ func (*endpoint) State() uint32 { // Info returns a copy of the endpoint info. func (ep *endpoint) Info() tcpip.EndpointInfo { ep.mu.RLock() - // Make a copy of the endpoint info. - ret := ep.TransportEndpointInfo - ep.mu.RUnlock() - return &ret + defer ep.mu.RUnlock() + return &stack.TransportEndpointInfo{NetProto: ep.netProto} } // Stats returns a pointer to the endpoint stats. @@ -490,18 +488,3 @@ func (*endpoint) SetOwner(tcpip.PacketOwner) {} func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { return &ep.ops } - -// freeze prevents any more packets from being delivered to the endpoint. -func (ep *endpoint) freeze() { - ep.mu.Lock() - ep.frozen = true - ep.mu.Unlock() -} - -// thaw unfreezes a previously frozen endpoint using endpoint.freeze() allows -// new packets to be delivered again. -func (ep *endpoint) thaw() { - ep.mu.Lock() - ep.frozen = false - ep.mu.Unlock() -} diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go index e729921db..d2768db7b 100644 --- a/pkg/tcpip/transport/packet/endpoint_state.go +++ b/pkg/tcpip/transport/packet/endpoint_state.go @@ -34,28 +34,26 @@ func (p *packet) loadReceivedAt(nsec int64) { // saveData saves packet.data field. func (p *packet) saveData() buffer.VectorisedView { - // We cannot save p.data directly as p.data.views may alias to p.views, - // which is not allowed by state framework (in-struct pointer). return p.data.Clone(nil) } // loadData loads packet.data field. func (p *packet) loadData(data buffer.VectorisedView) { - // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization - // here because data.views is not guaranteed to be loaded by now. Plus, - // data.views will be allocated anyway so there really is little point - // of utilizing p.views for data.views. p.data = data } // beforeSave is invoked by stateify. func (ep *endpoint) beforeSave() { - ep.freeze() + ep.rcvMu.Lock() + defer ep.rcvMu.Unlock() + ep.rcvDisabled = true } // afterLoad is invoked by stateify. func (ep *endpoint) afterLoad() { - ep.thaw() + ep.mu.Lock() + defer ep.mu.Unlock() + ep.stack = stack.StackFromEnv ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) @@ -63,4 +61,8 @@ func (ep *endpoint) afterLoad() { if err := ep.stack.RegisterPacketEndpoint(0, ep.netProto, ep); err != nil { panic(err) } + + ep.rcvMu.Lock() + ep.rcvDisabled = false + ep.rcvMu.Unlock() } diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 2eab09088..b7e97e218 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -33,6 +33,8 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", + "//pkg/tcpip/transport", + "//pkg/tcpip/transport/internal/network", "//pkg/tcpip/transport/packet", "//pkg/waiter", ], diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 1bce2769a..3040a445b 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -26,6 +26,7 @@ package raw import ( + "fmt" "io" "time" @@ -34,6 +35,8 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport" + "gvisor.dev/gvisor/pkg/tcpip/transport/internal/network" "gvisor.dev/gvisor/pkg/waiter" ) @@ -57,15 +60,19 @@ type rawPacket struct { // // +stateify savable type endpoint struct { - stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` + transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue associated bool + net network.Endpoint + stats tcpip.TransportEndpointStats `state:"nosave"` + ops tcpip.SocketOptions + // The following fields are used to manage the receive queue and are // protected by rcvMu. rcvMu sync.Mutex `state:"nosave"` @@ -74,20 +81,7 @@ type endpoint struct { rcvClosed bool // The following fields are protected by mu. - mu sync.RWMutex `state:"nosave"` - closed bool - connected bool - bound bool - // route is the route to a remote network endpoint. It is set via - // Connect(), and is valid only when conneted is true. - route *stack.Route `state:"manual"` - stats tcpip.TransportEndpointStats `state:"nosave"` - // owner is used to get uid and gid of the packet. - owner tcpip.PacketOwner - - // ops is used to get socket level options. - ops tcpip.SocketOptions - + mu sync.RWMutex `state:"nosave"` // frozen indicates if the packets should be delivered to the endpoint // during restore. frozen bool @@ -99,16 +93,9 @@ func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, trans } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, tcpip.Error) { - if netProto != header.IPv4ProtocolNumber && netProto != header.IPv6ProtocolNumber { - return nil, &tcpip.ErrUnknownProtocol{} - } - e := &endpoint{ - stack: s, - TransportEndpointInfo: stack.TransportEndpointInfo{ - NetProto: netProto, - TransProto: transProto, - }, + stack: s, + transProto: transProto, waiterQueue: waiterQueue, associated: associated, } @@ -116,6 +103,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt e.ops.SetHeaderIncluded(!associated) e.ops.SetSendBufferSize(32*1024, false /* notify */) e.ops.SetReceiveBufferSize(32*1024, false /* notify */) + e.net.Init(s, netProto, transProto, &e.ops) // Override with stack defaults. var ss tcpip.SendBufferSizeOption @@ -132,12 +120,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt // headers included. Because they're write-only, We don't need to // register with the stack. if !associated { - e.ops.SetReceiveBufferSize(0, false) + e.ops.SetReceiveBufferSize(0, false /* notify */) e.waiterQueue = nil return e, nil } - if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { + if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { return nil, err } @@ -154,11 +142,17 @@ func (e *endpoint) Close() { e.mu.Lock() defer e.mu.Unlock() - if e.closed || !e.associated { + if e.net.State() == transport.DatagramEndpointStateClosed { return } - e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) + e.net.Close() + + if !e.associated { + return + } + + e.stack.UnregisterRawTransportEndpoint(e.net.NetProto(), e.transProto, e) e.rcvMu.Lock() defer e.rcvMu.Unlock() @@ -170,15 +164,6 @@ func (e *endpoint) Close() { e.rcvList.Remove(e.rcvList.Front()) } - e.connected = false - - if e.route != nil { - e.route.Release() - e.route = nil - } - - e.closed = true - e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } @@ -186,9 +171,7 @@ func (e *endpoint) Close() { func (*endpoint) ModerateRecvBuf(int) {} func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { - e.mu.Lock() - defer e.mu.Unlock() - e.owner = owner + e.net.SetOwner(owner) } // Read implements tcpip.Endpoint.Read. @@ -236,14 +219,15 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // Write implements tcpip.Endpoint.Write. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + netProto := e.net.NetProto() // We can create, but not write to, unassociated IPv6 endpoints. - if !e.associated && e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber { + if !e.associated && netProto == header.IPv6ProtocolNumber { return 0, &tcpip.ErrInvalidOptionValue{} } if opts.To != nil { // Raw sockets do not support sending to a IPv4 address on a IPv6 endpoint. - if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(opts.To.Addr) != header.IPv6AddressSize { + if netProto == header.IPv6ProtocolNumber && len(opts.To.Addr) != header.IPv6AddressSize { return 0, &tcpip.ErrInvalidOptionValue{} } } @@ -269,99 +253,25 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - // MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op. - if opts.More { - return 0, &tcpip.ErrInvalidOptionValue{} + ctx, err := e.net.AcquireContextForWrite(opts) + if err != nil { + return 0, err } - payloadBytes, route, owner, err := func() ([]byte, *stack.Route, tcpip.PacketOwner, tcpip.Error) { - e.mu.RLock() - defer e.mu.RUnlock() - - if e.closed { - return nil, nil, nil, &tcpip.ErrInvalidEndpointState{} - } - - payloadBytes := make([]byte, p.Len()) - if _, err := io.ReadFull(p, payloadBytes); err != nil { - return nil, nil, nil, &tcpip.ErrBadBuffer{} - } - // If this is an unassociated socket and callee provided a nonzero - // destination address, route using that address. - if e.ops.GetHeaderIncluded() { - ip := header.IPv4(payloadBytes) - if !ip.IsValid(len(payloadBytes)) { - return nil, nil, nil, &tcpip.ErrInvalidOptionValue{} - } - dstAddr := ip.DestinationAddress() - // Update dstAddr with the address in the IP header, unless - // opts.To is set (e.g. if sendto specifies a specific - // address). - if dstAddr != tcpip.Address([]byte{0, 0, 0, 0}) && opts.To == nil { - opts.To = &tcpip.FullAddress{ - NIC: 0, // NIC is unset. - Addr: dstAddr, // The address from the payload. - Port: 0, // There are no ports here. - } - } - } - - // Did the user caller provide a destination? If not, use the connected - // destination. - if opts.To == nil { - // If the user doesn't specify a destination, they should have - // connected to another address. - if !e.connected { - return nil, nil, nil, &tcpip.ErrDestinationRequired{} - } - - e.route.Acquire() - - return payloadBytes, e.route, e.owner, nil - } - - // The caller provided a destination. Reject destination address if it - // goes through a different NIC than the endpoint was bound to. - nic := opts.To.NIC - if e.bound && nic != 0 && nic != e.BindNICID { - return nil, nil, nil, &tcpip.ErrNoRoute{} - } + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. + payloadBytes := make([]byte, p.Len()) + if _, err := io.ReadFull(p, payloadBytes); err != nil { + return 0, &tcpip.ErrBadBuffer{} + } - // Find the route to the destination. If BindAddress is 0, - // FindRoute will choose an appropriate source address. - route, err := e.stack.FindRoute(nic, e.BindAddr, opts.To.Addr, e.NetProto, false) - if err != nil { - return nil, nil, nil, err - } + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(ctx.PacketInfo().MaxHeaderLength), + Data: buffer.View(payloadBytes).ToVectorisedView(), + }) - return payloadBytes, route, e.owner, nil - }() - if err != nil { + if err := ctx.WritePacket(pkt, e.ops.GetHeaderIncluded()); err != nil { return 0, err } - defer route.Release() - - if e.ops.GetHeaderIncluded() { - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - Data: buffer.View(payloadBytes).ToVectorisedView(), - }) - if err := route.WriteHeaderIncludedPacket(pkt); err != nil { - return 0, err - } - } else { - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: int(route.MaxHeaderLength()), - Data: buffer.View(payloadBytes).ToVectorisedView(), - }) - pkt.Owner = owner - if err := route.WritePacket(stack.NetworkHeaderParams{ - Protocol: e.TransProto, - TTL: route.DefaultTTL(), - TOS: stack.DefaultTOS, - }, pkt); err != nil { - return 0, err - } - } return int64(len(payloadBytes)), nil } @@ -373,66 +283,29 @@ func (*endpoint) Disconnect() tcpip.Error { // Connect implements tcpip.Endpoint.Connect. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { + netProto := e.net.NetProto() + // Raw sockets do not support connecting to a IPv4 address on a IPv6 endpoint. - if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(addr.Addr) != header.IPv6AddressSize { + if netProto == header.IPv6ProtocolNumber && len(addr.Addr) != header.IPv6AddressSize { return &tcpip.ErrAddressFamilyNotSupported{} } - e.mu.Lock() - defer e.mu.Unlock() - - if e.closed { - return &tcpip.ErrInvalidEndpointState{} - } - - nic := addr.NIC - if e.bound { - if e.BindNICID == 0 { - // If we're bound, but not to a specific NIC, the NIC - // in addr will be used. Nothing to do here. - } else if addr.NIC == 0 { - // If we're bound to a specific NIC, but addr doesn't - // specify a NIC, use the bound NIC. - nic = e.BindNICID - } else if addr.NIC != e.BindNICID { - // We're bound and addr specifies a NIC. They must be - // the same. - return &tcpip.ErrInvalidEndpointState{} - } - } - - // Find a route to the destination. - route, err := e.stack.FindRoute(nic, "", addr.Addr, e.NetProto, false) - if err != nil { - return err - } - - if e.associated { - // Re-register the endpoint with the appropriate NIC. - if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { - route.Release() - return err + return e.net.ConnectAndThen(addr, func(_ tcpip.NetworkProtocolNumber, _, _ stack.TransportEndpointID) tcpip.Error { + if e.associated { + // Re-register the endpoint with the appropriate NIC. + if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { + return err + } + e.stack.UnregisterRawTransportEndpoint(netProto, e.transProto, e) } - e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) - e.RegisterNICID = nic - } - if e.route != nil { - // If the endpoint was previously connected then release any previous route. - e.route.Release() - } - e.route = route - e.connected = true - - return nil + return nil + }) } // Shutdown implements tcpip.Endpoint.Shutdown. It's a noop for raw sockets. func (e *endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() - - if !e.connected { + if e.net.State() != transport.DatagramEndpointStateConnected { return &tcpip.ErrNotConnected{} } return nil @@ -450,33 +323,26 @@ func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpi // Bind implements tcpip.Endpoint.Bind. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() - - // If a local address was specified, verify that it's valid. - if len(addr.Addr) != 0 && e.stack.CheckLocalAddress(e.RegisterNICID, e.NetProto, addr.Addr) == 0 { - return &tcpip.ErrBadLocalAddress{} - } + return e.net.BindAndThen(addr, func(netProto tcpip.NetworkProtocolNumber, _ tcpip.Address) tcpip.Error { + if !e.associated { + return nil + } - if e.associated { // Re-register the endpoint with the appropriate NIC. - if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { + if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { return err } - e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) - e.RegisterNICID = addr.NIC - e.BindNICID = addr.NIC - } - - e.BindAddr = addr.Addr - e.bound = true - - return nil + e.stack.UnregisterRawTransportEndpoint(netProto, e.transProto, e) + return nil + }) } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. -func (*endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { - return tcpip.FullAddress{}, &tcpip.ErrNotSupported{} +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { + a := e.net.GetLocalAddress() + // Linux returns the protocol in the port field. + a.Port = uint16(e.transProto) + return a, nil } // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. @@ -509,17 +375,17 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { return nil default: - return &tcpip.ErrUnknownProtocolOption{} + return e.net.SetSockOpt(opt) } } -func (*endpoint) SetSockOptInt(tcpip.SockOptInt, int) tcpip.Error { - return &tcpip.ErrUnknownProtocolOption{} +func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { + return e.net.SetSockOptInt(opt, v) } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) tcpip.Error { - return &tcpip.ErrUnknownProtocolOption{} +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { + return e.net.GetSockOpt(opt) } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. @@ -536,100 +402,108 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { return v, nil default: - return -1, &tcpip.ErrUnknownProtocolOption{} + return e.net.GetSockOptInt(opt) } } // HandlePacket implements stack.RawTransportEndpoint.HandlePacket. func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { - e.mu.RLock() - e.rcvMu.Lock() + notifyReadableEvents := func() bool { + e.mu.RLock() + defer e.mu.RUnlock() + e.rcvMu.Lock() + defer e.rcvMu.Unlock() + + // Drop the packet if our buffer is currently full or if this is an unassociated + // endpoint (i.e endpoint created w/ IPPROTO_RAW). Such endpoints are send only + // See: https://man7.org/linux/man-pages/man7/raw.7.html + // + // An IPPROTO_RAW socket is send only. If you really want to receive + // all IP packets, use a packet(7) socket with the ETH_P_IP protocol. + // Note that packet sockets don't reassemble IP fragments, unlike raw + // sockets. + if e.rcvClosed || !e.associated { + e.stack.Stats().DroppedPackets.Increment() + e.stats.ReceiveErrors.ClosedReceiver.Increment() + return false + } - // Drop the packet if our buffer is currently full or if this is an unassociated - // endpoint (i.e endpoint created w/ IPPROTO_RAW). Such endpoints are send only - // See: https://man7.org/linux/man-pages/man7/raw.7.html - // - // An IPPROTO_RAW socket is send only. If you really want to receive - // all IP packets, use a packet(7) socket with the ETH_P_IP protocol. - // Note that packet sockets don't reassemble IP fragments, unlike raw - // sockets. - if e.rcvClosed || !e.associated { - e.rcvMu.Unlock() - e.mu.RUnlock() - e.stack.Stats().DroppedPackets.Increment() - e.stats.ReceiveErrors.ClosedReceiver.Increment() - return - } + rcvBufSize := e.ops.GetReceiveBufferSize() + if e.frozen || e.rcvBufSize >= int(rcvBufSize) { + e.stack.Stats().DroppedPackets.Increment() + e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() + return false + } - rcvBufSize := e.ops.GetReceiveBufferSize() - if e.frozen || e.rcvBufSize >= int(rcvBufSize) { - e.rcvMu.Unlock() - e.mu.RUnlock() - e.stack.Stats().DroppedPackets.Increment() - e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() - return - } + srcAddr := pkt.Network().SourceAddress() + info := e.net.Info() + + switch state := e.net.State(); state { + case transport.DatagramEndpointStateInitial: + case transport.DatagramEndpointStateConnected: + // If connected, only accept packets from the remote address we + // connected to. + if info.ID.RemoteAddress != srcAddr { + return false + } - remoteAddr := pkt.Network().SourceAddress() + // Connected sockets may also have been bound to a specific + // address/NIC. + fallthrough + case transport.DatagramEndpointStateBound: + // If bound to a NIC, only accept data for that NIC. + if info.BindNICID != 0 && info.BindNICID != pkt.NICID { + return false + } - if e.bound { - // If bound to a NIC, only accept data for that NIC. - if e.BindNICID != 0 && e.BindNICID != pkt.NICID { - e.rcvMu.Unlock() - e.mu.RUnlock() - return - } - // If bound to an address, only accept data for that address. - if e.BindAddr != "" && e.BindAddr != remoteAddr { - e.rcvMu.Unlock() - e.mu.RUnlock() - return + // If bound to an address, only accept data for that address. + if info.BindAddr != "" && info.BindAddr != pkt.Network().DestinationAddress() { + return false + } + default: + panic(fmt.Sprintf("unhandled state = %s", state)) } - } - // If connected, only accept packets from the remote address we - // connected to. - if e.connected && e.route.RemoteAddress() != remoteAddr { - e.rcvMu.Unlock() - e.mu.RUnlock() - return - } + wasEmpty := e.rcvBufSize == 0 - wasEmpty := e.rcvBufSize == 0 + // Push new packet into receive list and increment the buffer size. + packet := &rawPacket{ + senderAddr: tcpip.FullAddress{ + NIC: pkt.NICID, + Addr: srcAddr, + }, + } - // Push new packet into receive list and increment the buffer size. - packet := &rawPacket{ - senderAddr: tcpip.FullAddress{ - NIC: pkt.NICID, - Addr: remoteAddr, - }, - } + // Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not. + // We copy headers' underlying bytes because pkt.*Header may point to + // the middle of a slice, and another struct may point to the "outer" + // slice. Save/restore doesn't support overlapping slices and will fail. + // + // TODO(https://gvisor.dev/issue/6517): Avoid the copy once S/R supports + // overlapping slices. + var combinedVV buffer.VectorisedView + if info.NetProto == header.IPv4ProtocolNumber { + network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() + headers := make(buffer.View, 0, len(network)+len(transport)) + headers = append(headers, network...) + headers = append(headers, transport...) + combinedVV = headers.ToVectorisedView() + } else { + combinedVV = append(buffer.View(nil), pkt.TransportHeader().View()...).ToVectorisedView() + } + combinedVV.Append(pkt.Data().ExtractVV()) + packet.data = combinedVV + packet.receivedAt = e.stack.Clock().Now() - // Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not. - // We copy headers' underlying bytes because pkt.*Header may point to - // the middle of a slice, and another struct may point to the "outer" - // slice. Save/restore doesn't support overlapping slices and will fail. - var combinedVV buffer.VectorisedView - if e.TransportEndpointInfo.NetProto == header.IPv4ProtocolNumber { - network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() - headers := make(buffer.View, 0, len(network)+len(transport)) - headers = append(headers, network...) - headers = append(headers, transport...) - combinedVV = headers.ToVectorisedView() - } else { - combinedVV = append(buffer.View(nil), pkt.TransportHeader().View()...).ToVectorisedView() - } - combinedVV.Append(pkt.Data().ExtractVV()) - packet.data = combinedVV - packet.receivedAt = e.stack.Clock().Now() + e.rcvList.PushBack(packet) + e.rcvBufSize += packet.data.Size() + e.stats.PacketsReceived.Increment() - e.rcvList.PushBack(packet) - e.rcvBufSize += packet.data.Size() - e.rcvMu.Unlock() - e.mu.RUnlock() - e.stats.PacketsReceived.Increment() - // Notify waiters that there's data to be read. - if wasEmpty { + // Notify waiters that there is data to be read now. + return wasEmpty + }() + + if notifyReadableEvents { e.waiterQueue.Notify(waiter.ReadableEvents) } } @@ -641,10 +515,7 @@ func (e *endpoint) State() uint32 { // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { - e.mu.RLock() - // Make a copy of the endpoint info. - ret := e.TransportEndpointInfo - e.mu.RUnlock() + ret := e.net.Info() return &ret } diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go index 39669b445..e74713064 100644 --- a/pkg/tcpip/transport/raw/endpoint_state.go +++ b/pkg/tcpip/transport/raw/endpoint_state.go @@ -15,6 +15,7 @@ package raw import ( + "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" @@ -60,35 +61,16 @@ func (e *endpoint) beforeSave() { // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume(s *stack.Stack) { + e.net.Resume(s) + e.thaw() e.stack = s e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) - // If the endpoint is connected, re-connect. - if e.connected { - var err tcpip.Error - // TODO(gvisor.dev/issue/4906): Properly restore the route with the right - // remote address. We used to pass e.remote.RemoteAddress which was - // effectively the empty address but since moving e.route to hold a pointer - // to a route instead of the route by value, we pass the empty address - // directly. Obviously this was always wrong since we should provide the - // remote address we were connected to, to properly restore the route. - e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, "", e.NetProto, false) - if err != nil { - panic(err) - } - } - - // If the endpoint is bound, re-bind. - if e.bound { - if e.stack.CheckLocalAddress(e.RegisterNICID, e.NetProto, e.BindAddr) == 0 { - panic(&tcpip.ErrBadLocalAddress{}) - } - } - if e.associated { - if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { - panic(err) + netProto := e.net.NetProto() + if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { + panic(fmt.Sprintf("e.stack.RegisterRawTransportEndpoint(%d, %d, _): %s", netProto, e.transProto, err)) } } } diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 8436d2cf0..5148fe157 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -68,6 +68,7 @@ go_library( "//pkg/tcpip/hash/jenkins", "//pkg/tcpip/header", "//pkg/tcpip/header/parse", + "//pkg/tcpip/internal/tcp", "//pkg/tcpip/ports", "//pkg/tcpip/seqnum", "//pkg/tcpip/stack", @@ -96,6 +97,7 @@ go_test( "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/checker", + "//pkg/tcpip/faketime", "//pkg/tcpip/header", "//pkg/tcpip/link/loopback", "//pkg/tcpip/link/sniffer", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index d807b13b7..03c9fafa1 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -72,7 +72,8 @@ func encodeMSS(mss uint16) uint32 { // and must not be accessed or have its methods called concurrently as they // may mutate the stored objects. type listenContext struct { - stack *stack.Stack + stack *stack.Stack + protocol *protocol // rcvWnd is the receive window that is sent by this listening context // in the initial SYN-ACK. @@ -119,9 +120,10 @@ func timeStamp(clock tcpip.Clock) uint32 { } // newListenContext creates a new listen context. -func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { +func newListenContext(stk *stack.Stack, protocol *protocol, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { l := &listenContext{ stack: stk, + protocol: protocol, rcvWnd: rcvWnd, hasher: sha1.New(), v6Only: v6Only, @@ -201,7 +203,7 @@ func (l *listenContext) useSynCookies() bool { // createConnectingEndpoint creates a new endpoint in a connecting state, with // the connection parameters given by the arguments. -func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) { +func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) { // Create a new endpoint. netProto := l.netProto if netProto == 0 { @@ -213,7 +215,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts *header return nil, err } - n := newEndpoint(l.stack, netProto, queue) + n := newEndpoint(l.stack, l.protocol, netProto, queue) n.ops.SetV6Only(l.v6Only) n.TransportEndpointInfo.ID = s.id n.boundNICID = s.nicID @@ -244,10 +246,10 @@ func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts *header // On success, a handshake h is returned with h.ep.mu held. // // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. -func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*handshake, tcpip.Error) { +func (l *listenContext) startHandshake(s *segment, opts header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*handshake, tcpip.Error) { // Create new endpoint. irs := s.sequenceNumber - isn := generateSecureISN(s.id, l.stack.Clock(), l.stack.Seed()) + isn := generateSecureISN(s.id, l.stack.Clock(), l.protocol.seqnumSecret) ep, err := l.createConnectingEndpoint(s, opts, queue) if err != nil { return nil, err @@ -323,14 +325,16 @@ func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, q // established endpoint is returned with e.mu held. // // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. -func (l *listenContext) performHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, tcpip.Error) { +func (l *listenContext) performHandshake(s *segment, opts header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, tcpip.Error) { h, err := l.startHandshake(s, opts, queue, owner) if err != nil { return nil, err } ep := h.ep - if err := h.complete(); err != nil { + // N.B. the endpoint is generated above by startHandshake, and will be + // returned locked. This first call is forced. + if err := h.complete(); err != nil { // +checklocksforce ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.stats.FailedConnectionAttempts.Increment() l.cleanupFailedHandshake(h) @@ -364,6 +368,7 @@ func (l *listenContext) closeAllPendingEndpoints() { } // Precondition: h.ep.mu must be held. +// +checklocks:h.ep.mu func (l *listenContext) cleanupFailedHandshake(h *handshake) { e := h.ep e.mu.Unlock() @@ -492,7 +497,7 @@ func (e *endpoint) notifyAborted() { // cookies to accept connections. // // Precondition: if ctx.listenEP != nil, ctx.listenEP.mu must be locked. -func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) tcpip.Error { +func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts header.TCPSynOptions) tcpip.Error { defer s.decRef() h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner) @@ -504,7 +509,9 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header } go func() { - if err := h.complete(); err != nil { + // Note that startHandshake returns a locked endpoint. The + // force call here just makes it so. + if err := h.complete(); err != nil { // +checklocksforce e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() ctx.cleanupFailedHandshake(h) @@ -576,7 +583,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err if !ctx.useSynCookies() { s.incRef() atomic.AddInt32(&e.synRcvdCount, 1) - return e.handleSynSegment(ctx, s, &opts) + return e.handleSynSegment(ctx, s, opts) } route, err := e.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) if err != nil { @@ -595,10 +602,14 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err synOpts := header.TCPSynOptions{ WS: -1, TS: opts.TS, - TSVal: tcpTimeStamp(e.stack.Clock().NowMonotonic(), timeStampOffset(e.stack.Rand())), TSEcr: opts.TSVal, MSS: calculateAdvertisedMSS(e.userMSS, route), } + if opts.TS { + offset := e.protocol.tsOffset(s.dstAddr, s.srcAddr) + now := e.stack.Clock().NowMonotonic() + synOpts.TSVal = offset.TSVal(now) + } cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) fields := tcpFields{ id: s.id, @@ -665,7 +676,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err } e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment() // Create newly accepted endpoint and deliver it. - rcvdSynOptions := &header.TCPSynOptions{ + rcvdSynOptions := header.TCPSynOptions{ MSS: mssTable[data], // Disable Window scaling as original SYN is // lost. @@ -720,25 +731,22 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err } n.isRegistered = true - - // clear the tsOffset for the newly created - // endpoint as the Timestamp was already - // randomly offset when the original SYN-ACK was - // sent above. - n.TSOffset = 0 + n.TSOffset = n.protocol.tsOffset(s.dstAddr, s.srcAddr) // Switch state to connected. n.isConnectNotified = true - n.transitionToStateEstablishedLocked(&handshake{ - ep: n, - iss: iss, - ackNum: irs + 1, - rcvWnd: seqnum.Size(n.initialReceiveWindow()), - sndWnd: s.window, - rcvWndScale: e.rcvWndScaleForHandshake(), - sndWndScale: rcvdSynOptions.WS, - mss: rcvdSynOptions.MSS, - }) + h := handshake{ + ep: n, + iss: iss, + ackNum: irs + 1, + rcvWnd: seqnum.Size(n.initialReceiveWindow()), + sndWnd: s.window, + rcvWndScale: e.rcvWndScaleForHandshake(), + sndWndScale: rcvdSynOptions.WS, + mss: rcvdSynOptions.MSS, + sampleRTTWithTSOnly: true, + } + h.transitionToStateEstablishedLocked(s) // Requeue the segment if the ACK completing the handshake has more info // to be procesed by the newly established endpoint. @@ -774,7 +782,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) { e.mu.Lock() v6Only := e.ops.GetV6Only() - ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto) + ctx := newListenContext(e.stack, e.protocol, e, rcvWnd, v6Only, e.NetProto) defer func() { // Mark endpoint as closed. This will prevent goroutines running diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 2137ebc25..5d8e18484 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -105,6 +105,11 @@ type handshake struct { // sendSYNOpts is the cached values for the SYN options to be sent. sendSYNOpts header.TCPSynOptions + + // sampleRTTWithTSOnly is true when the segment was retransmitted or we can't + // tell; then RTT can only be sampled when the incoming segment has timestamp + // options enabled. + sampleRTTWithTSOnly bool } func (e *endpoint) newHandshake() *handshake { @@ -117,10 +122,12 @@ func (e *endpoint) newHandshake() *handshake { h.resetState() // Store reference to handshake state in endpoint. e.h = h + // By the time handshake is created, e.ID is already initialized. + e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress) return h } -func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) *handshake { +func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) *handshake { h := e.newHandshake() h.resetToSynRcvd(isn, irs, opts, deferAccept) return h @@ -150,20 +157,23 @@ func (h *handshake) resetState() { h.flags = header.TCPFlagSyn h.ackNum = 0 h.mss = 0 - h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.stack.Seed()) + h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret) } // generateSecureISN generates a secure Initial Sequence number based on the // recommendation here https://tools.ietf.org/html/rfc6528#page-3. func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value { isnHasher := jenkins.Sum32(seed) - isnHasher.Write([]byte(id.LocalAddress)) - isnHasher.Write([]byte(id.RemoteAddress)) + // Per hash.Hash.Writer: + // + // It never returns an error. + _, _ = isnHasher.Write([]byte(id.LocalAddress)) + _, _ = isnHasher.Write([]byte(id.RemoteAddress)) portBuf := make([]byte, 2) binary.LittleEndian.PutUint16(portBuf, id.LocalPort) - isnHasher.Write(portBuf) + _, _ = isnHasher.Write(portBuf) binary.LittleEndian.PutUint16(portBuf, id.RemotePort) - isnHasher.Write(portBuf) + _, _ = isnHasher.Write(portBuf) // The time period here is 64ns. This is similar to what linux uses // generate a sequence number that overlaps less than one // time per MSL (2 minutes). @@ -190,7 +200,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 { // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD // state. -func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) { +func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) { h.active = false h.state = handshakeSynRcvd h.flags = header.TCPFlagSyn | header.TCPFlagAck @@ -251,10 +261,10 @@ func (h *handshake) synSentState(s *segment) tcpip.Error { rcvSynOpts := parseSynSegmentOptions(s) // Remember if the Timestamp option was negotiated. - h.ep.maybeEnableTimestamp(&rcvSynOpts) + h.ep.maybeEnableTimestamp(rcvSynOpts) // Remember if the SACKPermitted option was negotiated. - h.ep.maybeEnableSACKPermitted(&rcvSynOpts) + h.ep.maybeEnableSACKPermitted(rcvSynOpts) // Remember the sequence we'll ack from now on. h.ackNum = s.sequenceNumber + 1 @@ -266,8 +276,7 @@ func (h *handshake) synSentState(s *segment) tcpip.Error { // and the handshake is completed. if s.flags.Contains(header.TCPFlagAck) { h.state = handshakeCompleted - - h.ep.transitionToStateEstablishedLocked(h) + h.transitionToStateEstablishedLocked(s) h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) return nil @@ -283,7 +292,7 @@ func (h *handshake) synSentState(s *segment) tcpip.Error { synOpts := header.TCPSynOptions{ WS: int(h.effectiveRcvWndScale()), TS: rcvSynOpts.TS, - TSVal: h.ep.timestamp(), + TSVal: h.ep.tsValNow(), TSEcr: h.ep.recentTimestamp(), // We only send SACKPermitted if the other side indicated it @@ -353,7 +362,7 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error { synOpts := header.TCPSynOptions{ WS: h.rcvWndScale, TS: h.ep.SendTSOk, - TSVal: h.ep.timestamp(), + TSVal: h.ep.tsValNow(), TSEcr: h.ep.recentTimestamp(), SACKPermitted: h.ep.SACKPermitted, MSS: h.ep.amss, @@ -402,9 +411,10 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error { if h.ep.SendTSOk && s.parsedOptions.TS { h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) } + h.state = handshakeCompleted - h.ep.transitionToStateEstablishedLocked(h) + h.transitionToStateEstablishedLocked(s) // Requeue the segment if the ACK completing the handshake has more info // to be procesed by the newly established endpoint. @@ -480,7 +490,7 @@ func (h *handshake) start() { synOpts := header.TCPSynOptions{ WS: h.rcvWndScale, TS: true, - TSVal: h.ep.timestamp(), + TSVal: h.ep.tsValNow(), TSEcr: h.ep.recentTimestamp(), SACKPermitted: bool(sackEnabled), MSS: h.ep.amss, @@ -511,6 +521,7 @@ func (h *handshake) start() { } // complete completes the TCP 3-way handshake initiated by h.start(). +// +checklocks:h.ep.mu func (h *handshake) complete() tcpip.Error { // Set up the wakers. var s sleep.Sleeper @@ -556,6 +567,10 @@ func (h *handshake) complete() tcpip.Error { ack: h.ackNum, rcvWnd: h.rcvWnd, }, h.sendSYNOpts) + // If we have ever retransmitted the SYN-ACK or + // SYN segment, we should only measure RTT if + // TS option is present. + h.sampleRTTWithTSOnly = true } case wakerForNotification: @@ -599,6 +614,40 @@ func (h *handshake) complete() tcpip.Error { return nil } +// transitionToStateEstablisedLocked transitions the endpoint of the handshake +// to an established state given the last segment received from peer. It also +// initializes sender/receiver. +func (h *handshake) transitionToStateEstablishedLocked(s *segment) { + // Transfer handshake state to TCP connection. We disable + // receive window scaling if the peer doesn't support it + // (indicated by a negative send window scale). + h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) + + now := h.ep.stack.Clock().NowMonotonic() + + var rtt time.Duration + if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 { + rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr) + } + if !h.sampleRTTWithTSOnly && rtt == 0 { + rtt = now.Sub(h.startTime) + } + + if rtt > 0 { + h.ep.snd.updateRTO(rtt) + } + + h.ep.rcvQueueInfo.rcvQueueMu.Lock() + h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale()) + // Bootstrap the auto tuning algorithm. Starting at zero will + // result in a really large receive window after the first auto + // tuning adjustment. + h.ep.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd) + h.ep.rcvQueueInfo.rcvQueueMu.Unlock() + + h.ep.setEndpointState(StateEstablished) +} + type backoffTimer struct { timeout time.Duration maxTimeout time.Duration @@ -872,7 +921,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) - offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:]) + offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:]) } if e.SACKPermitted && len(sackBlocks) > 0 { offset += header.EncodeNOP(options[offset:]) @@ -909,30 +958,13 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags header.TCPFlags, se return err } -func (e *endpoint) handleWrite() { - e.sndQueueInfo.sndQueueMu.Lock() - next := e.drainSendQueueLocked() - e.sndQueueInfo.sndQueueMu.Unlock() - - e.sendData(next) -} - -// Move packets from send queue to send list. -// -// Precondition: e.sndBufMu must be locked. -func (e *endpoint) drainSendQueueLocked() *segment { - first := e.sndQueueInfo.sndQueue.Front() - if first != nil { - e.snd.writeList.PushBackList(&e.sndQueueInfo.sndQueue) - e.sndQueueInfo.SndBufInQueue = 0 - } - return first -} - // Precondition: e.mu must be locked. func (e *endpoint) sendData(next *segment) { // Initialize the next segment to write if it's currently nil. if e.snd.writeNext == nil { + if next == nil { + return + } e.snd.writeNext = next } @@ -940,17 +972,6 @@ func (e *endpoint) sendData(next *segment) { e.snd.sendData() } -func (e *endpoint) handleClose() { - if !e.EndpointState().connected() { - return - } - // Drain the send queue. - e.handleWrite() - - // Mark send side as closed. - e.snd.Closed = true -} - // resetConnectionLocked puts the endpoint in an error state with the given // error code and sends a RST if and only if the error is not ErrConnectionReset // indicating that the connection is being reset due to receiving a RST. This @@ -992,26 +1013,6 @@ func (e *endpoint) completeWorkerLocked() { } } -// transitionToStateEstablisedLocked transitions a given endpoint -// to an established state using the handshake parameters provided. -// It also initializes sender/receiver. -func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) { - // Transfer handshake state to TCP connection. We disable - // receive window scaling if the peer doesn't support it - // (indicated by a negative send window scale). - e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) - - e.rcvQueueInfo.rcvQueueMu.Lock() - e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale()) - // Bootstrap the auto tuning algorithm. Starting at zero will - // result in a really large receive window after the first auto - // tuning adjustment. - e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd) - e.rcvQueueInfo.rcvQueueMu.Unlock() - - e.setEndpointState(StateEstablished) -} - // transitionToStateCloseLocked ensures that the endpoint is // cleaned up from the transport demuxer, "before" moving to // StateClose. This will ensure that no packet will be @@ -1130,7 +1131,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) { func (e *endpoint) handleSegmentsLocked(fastPath bool) tcpip.Error { checkRequeue := true for i := 0; i < maxSegmentsPerWake; i++ { - if e.EndpointState().closed() { + if state := e.EndpointState(); state.closed() || state == StateTimeWait { return nil } s := e.segmentQueue.dequeue() @@ -1311,42 +1312,45 @@ func (e *endpoint) disableKeepaliveTimer() { e.keepalive.Unlock() } -// protocolMainLoop is the main loop of the TCP protocol. It runs in its own -// goroutine and is responsible for sending segments and handling received -// segments. -func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { - e.mu.Lock() - var closeTimer tcpip.Timer - var closeWaker sleep.Waker - - epilogue := func() { - // e.mu is expected to be hold upon entering this section. - if e.snd != nil { - e.snd.resendTimer.cleanup() - e.snd.probeTimer.cleanup() - e.snd.reorderTimer.cleanup() - } +// protocolMainLoopDone is called at the end of protocolMainLoop. +// +checklocksrelease:e.mu +func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer) { + if e.snd != nil { + e.snd.resendTimer.cleanup() + e.snd.probeTimer.cleanup() + e.snd.reorderTimer.cleanup() + } - if closeTimer != nil { - closeTimer.Stop() - } + if closeTimer != nil { + closeTimer.Stop() + } - e.completeWorkerLocked() + e.completeWorkerLocked() - if e.drainDone != nil { - close(e.drainDone) - } + if e.drainDone != nil { + close(e.drainDone) + } - e.mu.Unlock() + e.mu.Unlock() - e.drainClosingSegmentQueue() + e.drainClosingSegmentQueue() - // When the protocol loop exits we should wake up our waiters. - e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) - } + // When the protocol loop exits we should wake up our waiters. + e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) +} + +// protocolMainLoop is the main loop of the TCP protocol. It runs in its own +// goroutine and is responsible for sending segments and handling received +// segments. +func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) { + var ( + closeTimer tcpip.Timer + closeWaker sleep.Waker + ) + e.mu.Lock() if handshake { - if err := e.h.complete(); err != nil { + if err := e.h.complete(); err != nil { // +checklocksforce e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() @@ -1355,9 +1359,8 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.hardError = err e.workerCleanup = true - // Lock released below. - epilogue() - return err + e.protocolMainLoopDone(closeTimer) + return } } @@ -1402,14 +1405,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ { w: &e.sndQueueInfo.sndWaker, f: func() tcpip.Error { - e.handleWrite() - return nil - }, - }, - { - w: &e.sndQueueInfo.sndCloseWaker, - f: func() tcpip.Error { - e.handleClose() + e.sendData(nil /* next */) return nil }, }, @@ -1507,7 +1503,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // Only block the worker if the endpoint // is not in closed state or error state. close(e.drainDone) - e.mu.Unlock() + e.mu.Unlock() // +checklocksforce <-e.undrain e.mu.Lock() } @@ -1568,8 +1564,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ if err != nil { e.resetConnectionLocked(err) } - // Lock released below. - epilogue() } loop: @@ -1593,7 +1587,8 @@ loop: // just want to terminate the loop and cleanup the // endpoint. cleanupOnError(nil) - return nil + e.protocolMainLoopDone(closeTimer) + return case StateTimeWait: fallthrough case StateClose: @@ -1601,7 +1596,8 @@ loop: default: if err := funcs[v].f(); err != nil { cleanupOnError(err) - return nil + e.protocolMainLoopDone(closeTimer) + return } } } @@ -1624,21 +1620,19 @@ loop: // Handle any StateError transition from StateTimeWait. if e.EndpointState() == StateError { cleanupOnError(nil) - return nil + e.protocolMainLoopDone(closeTimer) + return } e.transitionToStateCloseLocked() - // Lock released below. - epilogue() + e.protocolMainLoopDone(closeTimer) // A new SYN was received during TIME_WAIT and we need to abort // the timewait and redirect the segment to the listener queue if reuseTW != nil { reuseTW() } - - return nil } // handleTimeWaitSegments processes segments received during TIME_WAIT @@ -1700,6 +1694,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func() // should be executed after releasing the endpoint registrations. This is // done in cases where a new SYN is received during TIME_WAIT that carries // a sequence number larger than one see on the connection. +// +checklocks:e.mu func (e *endpoint) doTimeWait() (twReuse func()) { // Trigger a 2 * MSL time wait state. During this period // we will drop all incoming segments. diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go index dff7cb89c..7d110516b 100644 --- a/pkg/tcpip/transport/tcp/dispatcher.go +++ b/pkg/tcpip/transport/tcp/dispatcher.go @@ -127,7 +127,7 @@ func (p *processor) start(wg *sync.WaitGroup) { case !ep.segmentQueue.empty(): p.epQ.enqueue(ep) } - ep.mu.Unlock() + ep.mu.Unlock() // +checklocksforce } else { ep.newSegmentWaker.Assert() } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index a27e2110b..d2b8f298f 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -20,7 +20,6 @@ import ( "fmt" "io" "math" - "math/rand" "runtime" "strings" "sync/atomic" @@ -293,16 +292,9 @@ type sndQueueInfo struct { sndQueueMu sync.Mutex `state:"nosave"` stack.TCPSndBufState - // sndQueue holds segments that are ready to be sent. - sndQueue segmentList `state:"wait"` - - // sndWaker is used to signal the protocol goroutine when segments are - // added to the `sndQueue`. + // sndWaker is used to signal the protocol goroutine when there may be + // segments that need to be sent. sndWaker sleep.Waker `state:"manual"` - - // sndCloseWaker is used to notify the protocol goroutine when the send - // side is closed. - sndCloseWaker sleep.Waker `state:"manual"` } // rcvQueueInfo contains the endpoint's rcvQueue and associated metadata. @@ -385,6 +377,7 @@ type endpoint struct { // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. stack *stack.Stack `state:"manual"` + protocol *protocol `state:"manual"` waiterQueue *waiter.Queue `state:"wait"` uniqueID uint64 @@ -485,7 +478,7 @@ type endpoint struct { // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags - // tcpRecovery is the loss deteoction algorithm used by TCP. + // tcpRecovery is the loss recovery algorithm used by TCP. tcpRecovery tcpip.TCPRecovery // sack holds TCP SACK related information for this endpoint. @@ -671,6 +664,7 @@ func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The assumption behind spinning here being that background packet processing // should not be holding the lock for long and spinning reduces latency as we // avoid an expensive sleep/wakeup of of the syscall goroutine). +// +checklocksacquire:e.mu func (e *endpoint) LockUser() { for { // Try first if the sock is locked then check if it's owned @@ -690,7 +684,7 @@ func (e *endpoint) LockUser() { continue } atomic.StoreUint32(&e.ownedByUser, 1) - return + return // +checklocksforce } } @@ -707,7 +701,7 @@ func (e *endpoint) LockUser() { // protocol goroutine altogether. // // Precondition: e.LockUser() must have been called before calling e.UnlockUser() -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) UnlockUser() { // Lock segment queue before checking so that we avoid a race where // segments can be queued between the time we check if queue is empty @@ -743,12 +737,13 @@ func (e *endpoint) UnlockUser() { } // StopWork halts packet processing. Only to be used in tests. +// +checklocksacquire:e.mu func (e *endpoint) StopWork() { e.mu.Lock() } // ResumeWork resumes packet processing. Only to be used in tests. -// +checklocks:e.mu +// +checklocksrelease:e.mu func (e *endpoint) ResumeWork() { e.mu.Unlock() } @@ -759,7 +754,7 @@ func (e *endpoint) ResumeWork() { // // Precondition: e.mu must be held to call this method. func (e *endpoint) setEndpointState(state EndpointState) { - oldstate := EndpointState(atomic.LoadUint32(&e.state)) + oldstate := EndpointState(atomic.SwapUint32(&e.state, uint32(state))) switch state { case StateEstablished: e.stack.Stats().TCP.CurrentEstablished.Increment() @@ -776,7 +771,6 @@ func (e *endpoint) setEndpointState(state EndpointState) { e.stack.Stats().TCP.CurrentEstablished.Decrement() } } - atomic.StoreUint32(&e.state, uint32(state)) } // EndpointState returns the current state of the endpoint. @@ -809,9 +803,10 @@ type keepalive struct { waker sleep.Waker `state:"nosave"` } -func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { +func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { e := &endpoint{ - stack: s, + stack: s, + protocol: protocol, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, TransProto: header.TCPProtocolNumber, @@ -875,14 +870,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue e.maxSynRetries = uint8(synRetries) } - s.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) - if p := s.GetTCPProbe(); p != nil { e.probe = p } e.segmentQueue.ep = e - e.TSOffset = timeStampOffset(e.stack.Rand()) + e.acceptCond = sync.NewCond(&e.acceptMu) e.keepalive.timer.init(e.stack.Clock(), &e.keepalive.waker) @@ -1487,87 +1480,101 @@ func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) { return avail, nil } -// Write writes data to the endpoint's peer. -func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - // Linux completely ignores any address passed to sendto(2) for TCP sockets - // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More - // and opts.EndOfRecord are also ignored. +// readFromPayloader reads a slice from the Payloader. +// +checklocks:e.mu +// +checklocks:e.sndQueueInfo.sndQueueMu +func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) { + // We can release locks while copying data. + // + // This is not possible if atomic is set, because we can't allow the + // available buffer space to be consumed by some other caller while we + // are copying data in. + if !opts.Atomic { + e.sndQueueInfo.sndQueueMu.Unlock() + defer e.sndQueueInfo.sndQueueMu.Lock() - e.LockUser() - defer e.UnlockUser() + e.UnlockUser() + defer e.LockUser() + } - nextSeg, n, err := func() (*segment, int, tcpip.Error) { - e.sndQueueInfo.sndQueueMu.Lock() - defer e.sndQueueInfo.sndQueueMu.Unlock() + // Fetch data. + if l := p.Len(); l < avail { + avail = l + } + if avail == 0 { + return nil, nil + } + v := make([]byte, avail) + n, err := p.Read(v) + if err != nil && err != io.EOF { + return nil, &tcpip.ErrBadBuffer{} + } + return v[:n], nil +} + +// queueSegment reads data from the payloader and returns a segment to be sent. +// +checklocks:e.mu +func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) { + e.sndQueueInfo.sndQueueMu.Lock() + defer e.sndQueueInfo.sndQueueMu.Unlock() + + avail, err := e.isEndpointWritableLocked() + if err != nil { + e.stats.WriteErrors.WriteClosed.Increment() + return nil, 0, err + } + + v, err := e.readFromPayloader(p, opts, avail) + if err != nil { + return nil, 0, err + } + + // Do not queue zero length segments. + if len(v) == 0 { + return nil, 0, nil + } + if !opts.Atomic { + // Since we released locks in between it's possible that the + // endpoint transitioned to a CLOSED/ERROR states so make + // sure endpoint is still writable before trying to write. avail, err := e.isEndpointWritableLocked() if err != nil { e.stats.WriteErrors.WriteClosed.Increment() return nil, 0, err } - v, err := func() ([]byte, tcpip.Error) { - // We can release locks while copying data. - // - // This is not possible if atomic is set, because we can't allow the - // available buffer space to be consumed by some other caller while we - // are copying data in. - if !opts.Atomic { - e.sndQueueInfo.sndQueueMu.Unlock() - defer e.sndQueueInfo.sndQueueMu.Lock() - - e.UnlockUser() - defer e.LockUser() - } - - // Fetch data. - if l := p.Len(); l < avail { - avail = l - } - if avail == 0 { - return nil, nil - } - v := make([]byte, avail) - n, err := p.Read(v) - if err != nil && err != io.EOF { - return nil, &tcpip.ErrBadBuffer{} - } - return v[:n], nil - }() - if len(v) == 0 || err != nil { - return nil, 0, err + // Discard any excess data copied in due to avail being reduced due + // to a simultaneous write call to the socket. + if avail < len(v) { + v = v[:avail] } + } - if !opts.Atomic { - // Since we released locks in between it's possible that the - // endpoint transitioned to a CLOSED/ERROR states so make - // sure endpoint is still writable before trying to write. - avail, err := e.isEndpointWritableLocked() - if err != nil { - e.stats.WriteErrors.WriteClosed.Increment() - return nil, 0, err - } + // Add data to the send queue. + s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) + e.sndQueueInfo.SndBufUsed += len(v) + e.snd.writeList.PushBack(s) - // Discard any excess data copied in due to avail being reduced due - // to a simultaneous write call to the socket. - if avail < len(v) { - v = v[:avail] - } - } + return s, len(v), nil +} - // Add data to the send queue. - s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v) - e.sndQueueInfo.SndBufUsed += len(v) - e.sndQueueInfo.SndBufInQueue += seqnum.Size(len(v)) - e.sndQueueInfo.sndQueue.PushBack(s) +// Write writes data to the endpoint's peer. +func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { + // Linux completely ignores any address passed to sendto(2) for TCP sockets + // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More + // and opts.EndOfRecord are also ignored. + + e.LockUser() + defer e.UnlockUser() - return e.drainSendQueueLocked(), len(v), nil - }() // Return if either we didn't queue anything or if an error occurred while // attempting to queue data. + nextSeg, n, err := e.queueSegment(p, opts) if n == 0 || err != nil { return 0, err } + e.sendData(nextSeg) return int64(n), nil } @@ -1711,6 +1718,27 @@ func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64) { return rcvBufSz } +// OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. +func (e *endpoint) OnSetSendBufferSize(sz int64) int64 { + atomic.StoreUint32(&e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled, 1) + return sz +} + +// WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. +func (e *endpoint) WakeupWriters() { + e.LockUser() + defer e.UnlockUser() + + sendBufferSize := e.getSendBufferSize() + e.sndQueueInfo.sndQueueMu.Lock() + notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1 + e.sndQueueInfo.sndQueueMu.Unlock() + + if notify { + e.waiterQueue.Notify(waiter.WritableEvents) + } +} + // SetSockOptInt sets a socket option. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { // Lower 2 bits represents ECN bits. RFC 3168, section 23.1 @@ -2171,7 +2199,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcp portBuf := make([]byte, 2) binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort) - h := jenkins.Sum32(e.stack.Seed()) + h := jenkins.Sum32(e.protocol.portOffsetSecret) for _, s := range [][]byte{ []byte(e.ID.LocalAddress), []byte(e.ID.RemoteAddress), @@ -2314,7 +2342,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcp // connection setting here. if !handshake { e.segmentQueue.mu.Lock() - for _, l := range []segmentList{e.segmentQueue.list, e.sndQueueInfo.sndQueue, e.snd.writeList} { + for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} { for s := l.Front(); s != nil; s = s.Next() { s.id = e.TransportEndpointInfo.ID e.sndQueueInfo.sndWaker.Assert() @@ -2323,6 +2351,9 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcp e.segmentQueue.mu.Unlock() e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) e.setEndpointState(StateEstablished) + // Set the new auto tuned send buffer size after entering + // established state. + e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */) } if run { @@ -2372,6 +2403,9 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error { e.notifyProtocolGoroutine(notifyTickleWorker) return nil } + // Wake up any readers that maybe waiting for the stream to become + // readable. + e.waiterQueue.Notify(waiter.ReadableEvents) } // Close for write. @@ -2388,12 +2422,20 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error { // Queue fin segment. s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), nil) - e.sndQueueInfo.sndQueue.PushBack(s) - e.sndQueueInfo.SndBufInQueue++ + e.snd.writeList.PushBack(s) // Mark endpoint as closed. e.sndQueueInfo.SndClosed = true e.sndQueueInfo.sndQueueMu.Unlock() - e.handleClose() + + // Drain the send queue. + e.sendData(s) + + // Mark send side as closed. + e.snd.Closed = true + + // Wake up any writers that maybe waiting for the stream to become + // writable. + e.waiterQueue.Notify(waiter.WritableEvents) } return nil @@ -2501,6 +2543,7 @@ func (e *endpoint) listen(backlog int) tcpip.Error { // startAcceptedLoop sets up required state and starts a goroutine with the // main loop for accepted connections. +// +checklocksrelease:e.mu func (e *endpoint) startAcceptedLoop() { e.workerRunning = true e.mu.Unlock() @@ -2745,13 +2788,20 @@ func (e *endpoint) updateSndBufferUsage(v int) { e.sndQueueInfo.sndQueueMu.Lock() notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 e.sndQueueInfo.SndBufUsed -= v + + // Get the new send buffer size with auto tuning, but do not set it + // unless we decide to notify the writers. + newSndBufSz := e.computeTCPSendBufferSize() + // We only notify when there is half the sendBufferSize available after // a full buffer event occurs. This ensures that we don't wake up // writers to queue just 1-2 segments and go back to sleep. - notify = notify && e.sndQueueInfo.SndBufUsed < sendBufferSize>>1 + notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1 e.sndQueueInfo.sndQueueMu.Unlock() if notify { + // Set the new send buffer size calculated from auto tuning. + e.ops.SetSendBufferSize(newSndBufSz, false /* notify */) e.waiterQueue.Notify(waiter.WritableEvents) } } @@ -2855,46 +2905,29 @@ func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if // the SYN options indicate that timestamp option was negotiated. It also // initializes the recentTS with the value provided in synOpts.TSval. -func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) { +func (e *endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) { if synOpts.TS { e.SendTSOk = true e.setRecentTimestamp(synOpts.TSVal) } } -// timestamp returns the timestamp value to be used in the TSVal field of the -// timestamp option for outgoing TCP segments for a given endpoint. -func (e *endpoint) timestamp() uint32 { - return tcpTimeStamp(e.stack.Clock().NowMonotonic(), e.TSOffset) +func (e *endpoint) tsVal(now tcpip.MonotonicTime) uint32 { + return e.TSOffset.TSVal(now) } -// tcpTimeStamp returns a timestamp offset by the provided offset. This is -// not inlined above as it's used when SYN cookies are in use and endpoint -// is not created at the time when the SYN cookie is sent. -func tcpTimeStamp(curTime tcpip.MonotonicTime, offset uint32) uint32 { - d := curTime.Sub(tcpip.MonotonicTime{}) - return uint32(d.Milliseconds()) + offset +func (e *endpoint) tsValNow() uint32 { + return e.tsVal(e.stack.Clock().NowMonotonic()) } -// timeStampOffset returns a randomized timestamp offset to be used when sending -// timestamp values in a timestamp option for a TCP segment. -func timeStampOffset(rng *rand.Rand) uint32 { - // Initialize a random tsOffset that will be added to the recentTS - // everytime the timestamp is sent when the Timestamp option is enabled. - // - // See https://tools.ietf.org/html/rfc7323#section-5.4 for details on - // why this is required. - // - // NOTE: This is not completely to spec as normally this should be - // initialized in a manner analogous to how sequence numbers are - // randomized per connection basis. But for now this is sufficient. - return rng.Uint32() +func (e *endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { + return e.TSOffset.Elapsed(now, tsEcr) } // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint // if the SYN options indicate that the SACK option was negotiated and the TCP // stack is configured to enable TCP SACK option. -func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { +func (e *endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) { var v tcpip.TCPSACKEnabled if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { // Stack doesn't support SACK. So just return. @@ -2902,6 +2935,7 @@ func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { } if bool(v) && synOpts.SACKPermitted { e.SACKPermitted = true + e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) } } @@ -3072,3 +3106,36 @@ func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOpti Max: ss.Max, } } + +// computeTCPSendBufferSize implements auto tuning of send buffer size and +// returns the new send buffer size. +func (e *endpoint) computeTCPSendBufferSize() int64 { + curSndBufSz := int64(e.getSendBufferSize()) + + // Auto tuning is disabled when the user explicitly sets the send + // buffer size with SO_SNDBUF option. + if disabled := atomic.LoadUint32(&e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled); disabled == 1 { + return curSndBufSz + } + + const packetOverheadFactor = 2 + curMSS := e.snd.MaxPayloadSize + numSeg := InitialCwnd + if numSeg < e.snd.SndCwnd { + numSeg = e.snd.SndCwnd + } + + // SndCwnd indicates the number of segments that can be sent. This means + // that the sender can send upto #SndCwnd segments and the send buffer + // size should be set to SndCwnd*MSS to accommodate sending of all the + // segments. + newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor) + if newSndBufSz < curSndBufSz { + return curSndBufSz + } + if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz { + newSndBufSz = int64(ss.Max) + } + + return newSndBufSz +} diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 952ccacdd..f2e8b3840 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -170,6 +170,7 @@ func (e *endpoint) Resume(s *stack.Stack) { snd.probeTimer.init(s.Clock(), &snd.probeWaker) } e.stack = s + e.protocol = protocolFromStack(s) e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) e.segmentQueue.thaw() epState := EndpointState(e.origEndpointState) diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 65c86823a..128ef09e3 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -54,7 +54,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward maxInFlight: maxInFlight, handler: handler, inFlight: make(map[stack.TransportEndpointID]struct{}), - listen: newListenContext(s, nil /* listenEP */, seqnum.Size(rcvWnd), true, 0), + listen: newListenContext(s, protocolFromStack(s), nil /* listenEP */, seqnum.Size(rcvWnd), true, 0), } } @@ -152,7 +152,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, } f := r.forwarder - ep, err := f.listen.performHandshake(r.segment, &header.TCPSynOptions{ + ep, err := f.listen.performHandshake(r.segment, header.TCPSynOptions{ MSS: r.synOptions.MSS, WS: r.synOptions.WS, TS: r.synOptions.TS, @@ -164,8 +164,9 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, return nil, err } - // Start the protocol goroutine. - ep.startAcceptedLoop() + // Start the protocol goroutine. Note that the endpoint is returned + // from performHandshake locked. + ep.startAcceptedLoop() // +checklocksforce return ep, nil } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 2fc282e73..e4410ad93 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -23,8 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" + "gvisor.dev/gvisor/pkg/tcpip/internal/tcp" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" @@ -49,10 +51,6 @@ const ( // MaxBufferSize is the largest size a receive/send buffer can grow to. MaxBufferSize = 4 << 20 // 4MB - // MaxUnprocessedSegments is the maximum number of unprocessed segments - // that can be queued for a given endpoint. - MaxUnprocessedSegments = 300 - // DefaultTCPLingerTimeout is the amount of time that sockets linger in // FIN_WAIT_2 state before being marked closed. DefaultTCPLingerTimeout = 60 * time.Second @@ -96,6 +94,11 @@ type protocol struct { maxRetries uint32 synRetries uint8 dispatcher dispatcher + + // The following secrets are initialized once and stay unchanged after. + seqnumSecret uint32 + portOffsetSecret uint32 + tsOffsetSecret uint32 } // Number returns the tcp protocol number. @@ -105,7 +108,7 @@ func (*protocol) Number() tcpip.TransportProtocolNumber { // NewEndpoint creates a new tcp endpoint. func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { - return newEndpoint(p.stack, netProto, waiterQueue), nil + return newEndpoint(p.stack, p, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently @@ -156,6 +159,24 @@ func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, return stack.UnknownDestinationPacketHandled } +func (p *protocol) tsOffset(src, dst tcpip.Address) tcp.TSOffset { + // Initialize a random tsOffset that will be added to the recentTS + // everytime the timestamp is sent when the Timestamp option is enabled. + // + // See https://tools.ietf.org/html/rfc7323#section-5.4 for details on + // why this is required. + // + // TODO(https://gvisor.dev/issues/6473): This is not really secure as + // it does not use the recommended algorithm linked above. + h := jenkins.Sum32(p.tsOffsetSecret) + // Per hash.Hash.Writer: + // + // It never returns an error. + _, _ = h.Write([]byte(src)) + _, _ = h.Write([]byte(dst)) + return tcp.NewTSOffset(h.Sum32()) +} + // replyWithReset replies to the given segment with a reset segment. // // If the passed TTL is 0, then the route's default TTL will be used. @@ -292,22 +313,26 @@ func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip case *tcpip.TCPMinRTOOption: p.mu.Lock() + defer p.mu.Unlock() if *v < 0 { p.minRTO = MinRTO + } else if minRTO := time.Duration(*v); minRTO <= p.maxRTO { + p.minRTO = minRTO } else { - p.minRTO = time.Duration(*v) + return &tcpip.ErrInvalidOptionValue{} } - p.mu.Unlock() return nil case *tcpip.TCPMaxRTOOption: p.mu.Lock() + defer p.mu.Unlock() if *v < 0 { p.maxRTO = MaxRTO + } else if maxRTO := time.Duration(*v); maxRTO >= p.minRTO { + p.maxRTO = maxRTO } else { - p.maxRTO = time.Duration(*v) + return &tcpip.ErrInvalidOptionValue{} } - p.mu.Unlock() return nil case *tcpip.TCPMaxRetriesOption: @@ -478,9 +503,16 @@ func NewProtocol(s *stack.Stack) stack.TransportProtocol { minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, - // TODO(gvisor.dev/issue/5243): Set recovery to tcpip.TCPRACKLossDetection. - recovery: 0, + recovery: tcpip.TCPRACKLossDetection, + seqnumSecret: s.Rand().Uint32(), + portOffsetSecret: s.Rand().Uint32(), + tsOffsetSecret: s.Rand().Uint32(), } p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0)) return &p } + +// protocolFromStack retrieves the tcp.protocol instance from stack s. +func protocolFromStack(s *stack.Stack) *protocol { + return s.TransportProtocolInstance(ProtocolNumber).(*protocol) +} diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go index 0da4eafaa..3b055c294 100644 --- a/pkg/tcpip/transport/tcp/rack.go +++ b/pkg/tcpip/transport/tcp/rack.go @@ -80,7 +80,6 @@ func (rc *rackControl) init(snd *sender, iss seqnum.Value) { // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2 func (rc *rackControl) update(seg *segment, ackSeg *segment) { rtt := rc.snd.ep.stack.Clock().NowMonotonic().Sub(seg.xmitTime) - tsOffset := rc.snd.ep.TSOffset // If the ACK is for a retransmitted packet, do not update if it is a // spurious inference which is determined by below checks: @@ -92,7 +91,7 @@ func (rc *rackControl) update(seg *segment, ackSeg *segment) { // step 2 if seg.xmitCount > 1 { if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { - if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, tsOffset) { + if ackSeg.parsedOptions.TSEcr < rc.snd.ep.tsVal(seg.xmitTime) { return } } diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 661ca604a..90e493978 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -477,7 +477,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err tcpip.Error) { // segments. This ensures that we always leave some space for the inorder // segments to arrive allowing pending segments to be processed and // delivered to the user. - if rcvBufSize := r.ep.ops.GetReceiveBufferSize(); rcvBufSize > 0 && r.PendingBufUsed < int(rcvBufSize)>>2 { + if rcvBufSize := r.ep.ops.GetReceiveBufferSize(); rcvBufSize > 0 && (r.PendingBufUsed+int(segLen)) < int(rcvBufSize)>>2 { r.ep.rcvQueueInfo.rcvQueueMu.Lock() r.PendingBufUsed += s.segMemSize() r.ep.rcvQueueInfo.rcvQueueMu.Unlock() @@ -559,7 +559,6 @@ func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn // (2) returns to TIME-WAIT state if the SYN turns out // to be an old duplicate". if s.flags.Contains(header.TCPFlagSyn) && r.RcvNxt.LessThan(segSeq) { - return false, true } diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 72d58dcff..2fabf1594 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -382,6 +382,9 @@ func (s *sender) updateRTO(rtt time.Duration) { if s.RTO < s.minRTO { s.RTO = s.minRTO } + if s.RTO > s.maxRTO { + s.RTO = s.maxRTO + } } // resendSegment resends the first unacknowledged segment. @@ -1154,6 +1157,13 @@ func (s *sender) walkSACK(rcvdSeg *segment) { idx := 0 n := len(rcvdSeg.parsedOptions.SACKBlocks) if checkDSACK(rcvdSeg) { + dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0] + numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize) + // numDSACK can be zero when DSACK is sent for subsegments. + if numDSACK < 1 { + numDSACK = 1 + } + s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK) s.rc.setDSACKSeen(true) idx = 1 n-- @@ -1335,10 +1345,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // some new data, i.e., only if it advances the left edge of // the send window. if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { - // TSVal/Ecr values sent by Netstack are at a millisecond - // granularity. - elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond - s.updateRTO(elapsed) + s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr)) } if s.shouldSchedulePTO() { @@ -1408,9 +1415,6 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { ackLeft -= datalen } - // Update the send buffer usage and notify potential waiters. - s.ep.updateSndBufferUsage(int(acked)) - // Clear SACK information for all acked data. s.ep.scoreboard.Delete(s.SndUna) @@ -1430,6 +1434,9 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { } } + // Update the send buffer usage and notify potential waiters. + s.ep.updateSndBufferUsage(int(acked)) + // It is possible for s.outstanding to drop below zero if we get // a retransmit timeout, reset outstanding to zero but later // get an ack that cover previously sent data. diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go index ced3a9c58..84fb1c416 100644 --- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go +++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go @@ -16,6 +16,7 @@ // iterations taking long enough that the retransmit timer can kick in causing // the congestion window measurements to fail due to extra packets etc. // +//go:build !race // +build !race package tcp_test diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go index d6cf786a1..c35db7c95 100644 --- a/pkg/tcpip/transport/tcp/tcp_rack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -33,12 +33,11 @@ const ( tsOptionSize = 12 maxTCPOptionSize = 40 mtu = header.TCPMinimumSize + header.IPv4MinimumSize + maxTCPOptionSize + maxPayload - latency = 5 * time.Millisecond ) -func setStackRACKPermitted(t *testing.T, c *context.Context) { +func setStackTCPRecovery(t *testing.T, c *context.Context, recovery int) { t.Helper() - opt := tcpip.TCPRACKLossDetection + opt := tcpip.TCPRecovery(recovery) if err := c.Stack().SetTransportProtocolOption(header.TCPProtocolNumber, &opt); err != nil { t.Fatalf("c.s.SetTransportProtocolOption(%d, &%v(%v)): %s", header.TCPProtocolNumber, opt, opt, err) } @@ -70,7 +69,6 @@ func TestRACKUpdate(t *testing.T) { close(probeDone) }) setStackSACKPermitted(t, c, true) - setStackRACKPermitted(t, c) createConnectedWithSACKAndTS(c) data := make([]byte, maxPayload) @@ -129,7 +127,6 @@ func TestRACKDetectReorder(t *testing.T) { close(probeDone) }) setStackSACKPermitted(t, c, true) - setStackRACKPermitted(t, c) createConnectedWithSACKAndTS(c) data := make([]byte, ackNumToVerify*maxPayload) for i := range data { @@ -162,10 +159,13 @@ func TestRACKDetectReorder(t *testing.T) { func sendAndReceiveWithSACK(t *testing.T, c *context.Context, numPackets int, enableRACK bool) []byte { setStackSACKPermitted(t, c, true) - if enableRACK { - setStackRACKPermitted(t, c) + if !enableRACK { + setStackTCPRecovery(t, c, 0) } - createConnectedWithSACKAndTS(c) + // The delay should be below initial RTO (1s) otherwise retransimission + // will start. Choose a relatively large value so that estimated RTT + // keeps high even after a few rounds of undelayed RTT samples. + c.CreateConnectedWithOptions(header.TCPSynOptions{SACKPermitted: c.SACKEnabled(), TS: true}, 800*time.Millisecond /* delay */) data := make([]byte, numPackets*maxPayload) for i := range data { @@ -183,9 +183,6 @@ func sendAndReceiveWithSACK(t *testing.T, c *context.Context, numPackets int, en for i := 0; i < numPackets; i++ { c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize) bytesRead += maxPayload - // This delay is added to increase RTT as low RTT can cause TLP - // before sending ACK. - time.Sleep(latency) } return data @@ -542,6 +539,28 @@ func TestRACKDetectDSACK(t *testing.T) { case invalidDSACKDetected: t.Fatalf("RACK DSACK detected when there is no duplicate SACK") } + + metricPollFn := func() error { + tcpStats := c.Stack().Stats().TCP + stats := []struct { + stat *tcpip.StatCounter + name string + want uint64 + }{ + // Check DSACK was received for one segment. + {tcpStats.SegmentsAckedWithDSACK, "stats.TCP.SegmentsAckedWithDSACK", 1}, + } + for _, s := range stats { + if got, want := s.stat.Value(), s.want; got != want { + return fmt.Errorf("got %s.Value() = %d, want = %d", s.name, got, want) + } + } + return nil + } + + if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil { + t.Error(err) + } } // TestRACKDetectDSACKWithOutOfOrder tests that RACK detects DSACK with out of @@ -682,6 +701,28 @@ func TestRACKDetectDSACKSingleDup(t *testing.T) { case invalidDSACKDetected: t.Fatalf("RACK DSACK detected when there is no duplicate SACK") } + + metricPollFn := func() error { + tcpStats := c.Stack().Stats().TCP + stats := []struct { + stat *tcpip.StatCounter + name string + want uint64 + }{ + // Check DSACK was received for a subsegment. + {tcpStats.SegmentsAckedWithDSACK, "stats.TCP.SegmentsAckedWithDSACK", 1}, + } + for _, s := range stats { + if got, want := s.stat.Value(), s.want; got != want { + return fmt.Errorf("got %s.Value() = %d, want = %d", s.name, got, want) + } + } + return nil + } + + if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil { + t.Error(err) + } } // TestRACKDetectDSACKDupWithCumulativeACK tests DSACK for two non-contiguous @@ -998,7 +1039,6 @@ func TestRACKWithWindowFull(t *testing.T) { defer c.Cleanup() setStackSACKPermitted(t, c, true) - setStackRACKPermitted(t, c) createConnectedWithSACKAndTS(c) seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 20c9761f2..6255355bb 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -35,13 +35,13 @@ import ( // SACKPermitted option enabled if the stack in the context has the SACK support // enabled. func createConnectedWithSACKPermittedOption(c *context.Context) *context.RawEndpoint { - return c.CreateConnectedWithOptions(header.TCPSynOptions{SACKPermitted: c.SACKEnabled()}) + return c.CreateConnectedWithOptionsNoDelay(header.TCPSynOptions{SACKPermitted: c.SACKEnabled()}) } // createConnectedWithSACKAndTS creates and connects c.ep with the SACK & TS // option enabled if the stack in the context has SACK and TS enabled. func createConnectedWithSACKAndTS(c *context.Context) *context.RawEndpoint { - return c.CreateConnectedWithOptions(header.TCPSynOptions{SACKPermitted: c.SACKEnabled(), TS: true}) + return c.CreateConnectedWithOptionsNoDelay(header.TCPSynOptions{SACKPermitted: c.SACKEnabled(), TS: true}) } func setStackSACKPermitted(t *testing.T, c *context.Context, enable bool) { @@ -61,6 +61,7 @@ func TestSackPermittedConnect(t *testing.T) { defer c.Cleanup() setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) rep := createConnectedWithSACKPermittedOption(c) data := []byte{1, 2, 3} @@ -105,8 +106,9 @@ func TestSackDisabledConnect(t *testing.T) { defer c.Cleanup() setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) - rep := c.CreateConnectedWithOptions(header.TCPSynOptions{}) + rep := c.CreateConnectedWithOptionsNoDelay(header.TCPSynOptions{}) data := []byte{1, 2, 3} @@ -166,8 +168,9 @@ func TestSackPermittedAccept(t *testing.T) { } } setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) - rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, SACKPermitted: tc.sackPermitted}) + rep := c.AcceptWithOptionsNoDelay(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, SACKPermitted: tc.sackPermitted}) // Now verify no SACK blocks are // received when sack is disabled. data := []byte{1, 2, 3} @@ -239,8 +242,9 @@ func TestSackDisabledAccept(t *testing.T) { } setStackSACKPermitted(t, c, sackEnabled) + setStackTCPRecovery(t, c, 0) - rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS}) + rep := c.AcceptWithOptionsNoDelay(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS}) // Now verify no SACK blocks are // received when sack is disabled. @@ -386,6 +390,7 @@ func TestSACKRecovery(t *testing.T) { log.Printf("state: %+v\n", s) }) setStackSACKPermitted(t, c, true) + setStackTCPRecovery(t, c, 0) createConnectedWithSACKAndTS(c) const iterations = 3 diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 9bbe9bc3e..58817371e 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checker" + "gvisor.dev/gvisor/pkg/tcpip/faketime" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" @@ -1381,8 +1382,12 @@ func TestListenerReadinessOnEvent(t *testing.T) { if err := s.CreateNIC(id, ep); err != nil { t.Fatalf("CreateNIC(%d, %T): %s", id, ep, err) } - if err := s.AddAddress(id, ipv4.ProtocolNumber, context.StackAddr); err != nil { - t.Fatalf("AddAddress(%d, ipv4.ProtocolNumber, %s): %s", id, context.StackAddr, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.Address(context.StackAddr).WithPrefix(), + } + if err := s.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", id, protocolAddr, err) } s.SetRouteTable([]tcpip.Route{ {Destination: header.IPv4EmptySubnet, NIC: id}, @@ -2127,6 +2132,214 @@ func TestFullWindowReceive(t *testing.T) { ) } +func TestSmallReceiveBufferReadiness(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, + }) + + ep := loopback.New() + if testing.Verbose() { + ep = sniffer.New(ep) + } + + const nicID = 1 + nicOpts := stack.NICOptions{Name: "nic1"} + if err := s.CreateNICWithOptions(nicID, ep, nicOpts); err != nil { + t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %s", nicOpts, err) + } + + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpip.Address("\x7f\x00\x00\x01"), + PrefixLen: 8, + }, + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}) failed: %s", nicID, protocolAddr, err) + } + + { + subnet, err := tcpip.NewSubnet("\x7f\x00\x00\x00", "\xff\x00\x00\x00") + if err != nil { + t.Fatalf("tcpip.NewSubnet failed: %s", err) + } + s.SetRouteTable([]tcpip.Route{ + { + Destination: subnet, + NIC: nicID, + }, + }) + } + + listenerEntry, listenerCh := waiter.NewChannelEntry(nil) + var listenerWQ waiter.Queue + listener, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &listenerWQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %s", err) + } + defer listener.Close() + listenerWQ.EventRegister(&listenerEntry, waiter.ReadableEvents) + defer listenerWQ.EventUnregister(&listenerEntry) + + if err := listener.Bind(tcpip.FullAddress{}); err != nil { + t.Fatalf("Bind failed: %s", err) + } + if err := listener.Listen(1); err != nil { + t.Fatalf("Bind failed: %s", err) + } + + localAddress, err := listener.GetLocalAddress() + if err != nil { + t.Fatalf("GetLocalAddress failed: %s", err) + } + + for i := 8; i > 0; i /= 2 { + size := int64(i << 10) + t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) { + var clientWQ waiter.Queue + client, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &clientWQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %s", err) + } + defer client.Close() + switch err := client.Connect(localAddress).(type) { + case nil: + t.Fatal("Connect returned nil error") + case *tcpip.ErrConnectStarted: + default: + t.Fatalf("Connect failed: %s", err) + } + + <-listenerCh + server, serverWQ, err := listener.Accept(nil) + if err != nil { + t.Fatalf("Accept failed: %s", err) + } + defer server.Close() + + client.SocketOptions().SetReceiveBufferSize(size, true) + // Send buffer size doesn't seem to affect this test. + // server.SocketOptions().SetSendBufferSize(size, true) + + clientEntry, clientCh := waiter.NewChannelEntry(nil) + clientWQ.EventRegister(&clientEntry, waiter.ReadableEvents) + defer clientWQ.EventUnregister(&clientEntry) + + serverEntry, serverCh := waiter.NewChannelEntry(nil) + serverWQ.EventRegister(&serverEntry, waiter.WritableEvents) + defer serverWQ.EventUnregister(&serverEntry) + + var total int64 + for { + var b [64 << 10]byte + var r bytes.Reader + r.Reset(b[:]) + switch n, err := server.Write(&r, tcpip.WriteOptions{}); err.(type) { + case nil: + t.Logf("wrote %d bytes", n) + total += n + continue + case *tcpip.ErrWouldBlock: + select { + case <-serverCh: + continue + case <-time.After(100 * time.Millisecond): + // Well and truly full. + t.Logf("send and receive queues are full") + } + default: + t.Fatalf("Write failed: %s", err) + } + break + } + t.Logf("wrote %d bytes in total", total) + + var wg sync.WaitGroup + defer wg.Wait() + + wg.Add(2) + go func() { + defer wg.Done() + + var b [64 << 10]byte + var r bytes.Reader + r.Reset(b[:]) + if err := func() error { + var total int64 + defer t.Logf("wrote %d bytes in total", total) + for r.Len() != 0 { + switch n, err := server.Write(&r, tcpip.WriteOptions{}); err.(type) { + case nil: + t.Logf("wrote %d bytes", n) + total += n + case *tcpip.ErrWouldBlock: + for { + t.Logf("waiting on server") + select { + case <-serverCh: + case <-time.After(time.Second): + if readiness := server.Readiness(waiter.WritableEvents); readiness != 0 { + t.Logf("server.Readiness(%b) = %b but channel not signaled", waiter.WritableEvents, readiness) + } + continue + } + break + } + default: + return fmt.Errorf("server.Write failed: %s", err) + } + } + if err := server.Shutdown(tcpip.ShutdownWrite); err != nil { + return fmt.Errorf("server.Shutdown failed: %s", err) + } + t.Logf("server end shutdown done") + return nil + }(); err != nil { + t.Error(err) + } + }() + + go func() { + defer wg.Done() + + if err := func() error { + total := 0 + defer t.Logf("read %d bytes in total", total) + for { + switch res, err := client.Read(ioutil.Discard, tcpip.ReadOptions{}); err.(type) { + case nil: + t.Logf("read %d bytes", res.Count) + total += res.Count + t.Logf("read total %d bytes till now", total) + case *tcpip.ErrClosedForReceive: + return nil + case *tcpip.ErrWouldBlock: + for { + t.Logf("waiting on client") + select { + case <-clientCh: + case <-time.After(time.Second): + if readiness := client.Readiness(waiter.ReadableEvents); readiness != 0 { + return fmt.Errorf("client.Readiness(%b) = %b but channel not signaled", waiter.ReadableEvents, readiness) + } + continue + } + break + } + default: + return fmt.Errorf("client.Write failed: %s", err) + } + } + }(); err != nil { + t.Error(err) + } + }() + }) + } +} + // Test the stack receive window advertisement on receiving segments smaller than // segment overhead. It tests for the right edge of the window to not grow when // the endpoint is not being read from. @@ -2143,11 +2356,11 @@ func TestSmallSegReceiveWindowAdvertisement(t *testing.T) { t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) } - c.AcceptWithOptions(tcp.FindWndScale(seqnum.Size(opt.Default)), header.TCPSynOptions{MSS: defaultIPv4MSS}) + c.AcceptWithOptionsNoDelay(tcp.FindWndScale(seqnum.Size(opt.Default)), header.TCPSynOptions{MSS: defaultIPv4MSS}) // Bump up the receive buffer size such that, when the receive window grows, // the scaled window exceeds maxUint16. - c.EP.SocketOptions().SetReceiveBufferSize(int64(opt.Max), true) + c.EP.SocketOptions().SetReceiveBufferSize(int64(opt.Max)*2, true /* notify */) // Keep the payload size < segment overhead and such that it is a multiple // of the window scaled value. This enables the test to perform equality @@ -2267,7 +2480,7 @@ func TestNoWindowShrinking(t *testing.T) { initialWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale initialLastAcceptableSeq := iss.Add(seqnum.Size(initialWnd)) // Now shrink the receive buffer to half its original size. - c.EP.SocketOptions().SetReceiveBufferSize(int64(rcvBufSize/2), true) + c.EP.SocketOptions().SetReceiveBufferSize(int64(rcvBufSize), true /* notify */) data := generateRandomPayload(t, rcvBufSize) // Send a payload of half the size of rcvBufSize. @@ -2523,7 +2736,7 @@ func TestScaledWindowAccept(t *testing.T) { defer ep.Close() // Set the window size greater than the maximum non-scaled window. - ep.SocketOptions().SetReceiveBufferSize(65535*3, true) + ep.SocketOptions().SetReceiveBufferSize(65535*6, true /* notify */) if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) @@ -2535,7 +2748,7 @@ func TestScaledWindowAccept(t *testing.T) { // Do 3-way handshake. // wndScale expected is 3 as 65535 * 3 * 2 < 65535 * 2^3 but > 65535 *2 *2 - c.PassiveConnectWithOptions(100, 3 /* wndScale */, header.TCPSynOptions{MSS: defaultIPv4MSS}) + c.PassiveConnectWithOptions(100, 3 /* wndScale */, header.TCPSynOptions{MSS: defaultIPv4MSS}, 0 /* delay */) // Try to accept the connection. we, ch := waiter.NewChannelEntry(nil) @@ -2595,7 +2808,7 @@ func TestNonScaledWindowAccept(t *testing.T) { defer ep.Close() // Set the window size greater than the maximum non-scaled window. - ep.SocketOptions().SetReceiveBufferSize(65535*3, true) + ep.SocketOptions().SetReceiveBufferSize(65535*6, true /* notify */) if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) @@ -3188,7 +3401,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) { // Set the buffer size to a deterministic size so that we can check the // window scaling option. const rcvBufferSize = 0x20000 - ep.SocketOptions().SetReceiveBufferSize(rcvBufferSize, true) + ep.SocketOptions().SetReceiveBufferSize(rcvBufferSize*2, true /* notify */) if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) @@ -3327,7 +3540,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { // window scaling option. const rcvBufferSize = 0x20000 const wndScale = 3 - c.EP.SocketOptions().SetReceiveBufferSize(rcvBufferSize, true) + c.EP.SocketOptions().SetReceiveBufferSize(rcvBufferSize*2, true /* notify */) // Start connection attempt. we, ch := waiter.NewChannelEntry(nil) @@ -3451,17 +3664,13 @@ loop: for { switch _, err := c.EP.Read(ioutil.Discard, tcpip.ReadOptions{}); err.(type) { case *tcpip.ErrWouldBlock: - select { - case <-ch: - // Expect the state to be StateError and subsequent Reads to fail with HardError. - _, err := c.EP.Read(ioutil.Discard, tcpip.ReadOptions{}) - if d := cmp.Diff(&tcpip.ErrConnectionReset{}, err); d != "" { - t.Fatalf("c.EP.Read() mismatch (-want +got):\n%s", d) - } - break loop - case <-time.After(1 * time.Second): - t.Fatalf("Timed out waiting for reset to arrive") + <-ch + // Expect the state to be StateError and subsequent Reads to fail with HardError. + _, err := c.EP.Read(ioutil.Discard, tcpip.ReadOptions{}) + if d := cmp.Diff(&tcpip.ErrConnectionReset{}, err); d != "" { + t.Fatalf("c.EP.Read() mismatch (-want +got):\n%s", d) } + break loop case *tcpip.ErrConnectionReset: break loop default: @@ -3472,14 +3681,27 @@ loop: if tcp.EndpointState(c.EP.State()) != tcp.StateError { t.Fatalf("got EP state is not StateError") } - if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 { - t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got) + + checkValid := func() []error { + var errors []error + if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 { + errors = append(errors, fmt.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got)) + } + if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { + errors = append(errors, fmt.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)) + } + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + errors = append(errors, fmt.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got)) + } + return errors } - if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) + + start := time.Now() + for time.Since(start) < time.Minute && len(checkValid()) > 0 { + time.Sleep(50 * time.Millisecond) } - if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) + for _, err := range checkValid() { + t.Error(err) } } @@ -3523,6 +3745,12 @@ func TestMaxRetransmitsTimeout(t *testing.T) { t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) } + // Wait for the connection to timeout after MaxRetries retransmits. + initRTO := time.Second + minRTOOpt := tcpip.TCPMinRTOOption(initRTO) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &minRTOOpt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, minRTOOpt, minRTOOpt, err) + } c.CreateConnected(context.TestInitialSequenceNumber, 30000 /* rcvWnd */, -1 /* epRcvBuf */) waitEntry, notifyCh := waiter.NewChannelEntry(nil) @@ -3545,8 +3773,6 @@ func TestMaxRetransmitsTimeout(t *testing.T) { ), ) } - // Wait for the connection to timeout after MaxRetries retransmits. - initRTO := 1 * time.Second select { case <-notifyCh: case <-time.After((2 << numRetries) * initRTO): @@ -3581,9 +3807,13 @@ func TestMaxRTO(t *testing.T) { defer c.Cleanup() rto := 1 * time.Second - opt := tcpip.TCPMaxRTOOption(rto) - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) + minRTOOpt := tcpip.TCPMinRTOOption(rto / 2) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &minRTOOpt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, minRTOOpt, minRTOOpt, err) + } + maxRTOOpt := tcpip.TCPMaxRTOOption(rto) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &maxRTOOpt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, maxRTOOpt, maxRTOOpt, err) } c.CreateConnected(context.TestInitialSequenceNumber, 30000 /* rcvWnd */, -1 /* epRcvBuf */) @@ -3609,12 +3839,44 @@ func TestMaxRTO(t *testing.T) { checker.TCPFlagsMatch(header.TCPFlagAck, ^header.TCPFlagPsh), ), ) - if time.Since(start).Round(time.Second).Seconds() != rto.Seconds() { - t.Errorf("Retransmit interval not capped to MaxRTO.\n") + if elapsed := time.Since(start); elapsed.Round(time.Second).Seconds() != rto.Seconds() { + t.Errorf("Retransmit interval not capped to MaxRTO(%s). %s", rto, elapsed) } } } +// TestZeroSizedWriteRetransmit tests that a zero sized write should not +// result in a panic on an RTO as no segment should have been queued for +// a zero sized write. +func TestZeroSizedWriteRetransmit(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.CreateConnected(context.TestInitialSequenceNumber, 30000 /* rcvWnd */, -1 /* epRcvBuf */) + + var r bytes.Reader + _, err := c.EP.Write(&r, tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %s", err) + } + // Now do a non-zero sized write to trigger actual sending of data. + r.Reset(make([]byte, 1)) + _, err = c.EP.Write(&r, tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %s", err) + } + // Do not ACK the packet and expect an original transmit and a + // retransmit. This should not cause a panic. + for i := 0; i < 2; i++ { + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^header.TCPFlagPsh), + ), + ) + } +} + // TestRetransmitIPv4IDUniqueness tests that the IPv4 Identification field is // unique on retransmits. func TestRetransmitIPv4IDUniqueness(t *testing.T) { @@ -3629,6 +3891,10 @@ func TestRetransmitIPv4IDUniqueness(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() + minRTOOpt := tcpip.TCPMinRTOOption(time.Second) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &minRTOOpt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, minRTOOpt, minRTOOpt, err) + } c.CreateConnected(context.TestInitialSequenceNumber, 30000 /* rcvWnd */, -1 /* epRcvBuf */) // Disabling PMTU discovery causes all packets sent from this socket to @@ -4628,52 +4894,6 @@ func TestDefaultBufferSizes(t *testing.T) { checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*3) } -func TestMinMaxBufferSizes(t *testing.T) { - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, - TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, - }) - - // Check the default values. - ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) - if err != nil { - t.Fatalf("NewEndpoint failed; %s", err) - } - defer ep.Close() - - // Change the min/max values for send/receive - { - opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20} - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) - } - } - - { - opt := tcpip.TCPSendBufferSizeRangeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30} - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) - } - } - - // Set values below the min/2. - ep.SocketOptions().SetReceiveBufferSize(99, true) - checkRecvBufferSize(t, ep, 200) - - ep.SocketOptions().SetSendBufferSize(149, true) - - checkSendBufferSize(t, ep, 300) - - // Set values above the max. - ep.SocketOptions().SetReceiveBufferSize(1+tcp.DefaultReceiveBufferSize*20, true) - // Values above max are capped at max and then doubled. - checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20*2) - - ep.SocketOptions().SetSendBufferSize(1+tcp.DefaultSendBufferSize*30, true) - // Values above max are capped at max and then doubled. - checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30*2) -} - func TestBindToDeviceOption(t *testing.T) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol}, @@ -4741,13 +4961,17 @@ func makeStack() (*stack.Stack, tcpip.Error) { } for _, ct := range []struct { - number tcpip.NetworkProtocolNumber - address tcpip.Address + number tcpip.NetworkProtocolNumber + addrWithPrefix tcpip.AddressWithPrefix }{ - {ipv4.ProtocolNumber, context.StackAddr}, - {ipv6.ProtocolNumber, context.StackV6Addr}, + {ipv4.ProtocolNumber, context.StackAddrWithPrefix}, + {ipv6.ProtocolNumber, context.StackV6AddrWithPrefix}, } { - if err := s.AddAddress(1, ct.number, ct.address); err != nil { + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ct.number, + AddressWithPrefix: ct.addrWithPrefix, + } + if err := s.AddProtocolAddress(1, protocolAddr, stack.AddressProperties{}); err != nil { return nil, err } } @@ -4951,7 +5175,7 @@ func TestConnectAvoidsBoundPorts(t *testing.T) { t.Fatalf("got s.SetPortRange(%d, %d) = %s, want = nil", start, end, err) } for i := start; i <= end; i++ { - if makeEP(exhaustedNetwork).Bind(tcpip.FullAddress{Addr: address(t, exhaustedAddressType, isAny), Port: uint16(i)}); err != nil { + if err := makeEP(exhaustedNetwork).Bind(tcpip.FullAddress{Addr: address(t, exhaustedAddressType, isAny), Port: uint16(i)}); err != nil { t.Fatalf("Bind(%d) failed: %s", i, err) } } @@ -6068,6 +6292,11 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { // complete the connection to test that the large SEQ num // did not change the state from SYN-RCVD. + // Get setup to be notified about connection establishment. + we, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&we, waiter.ReadableEvents) + defer c.WQ.EventUnregister(&we) + // Send ACK to move to ESTABLISHED state. c.SendPacket(nil, &context.Headers{ SrcPort: context.TestPort, @@ -6078,32 +6307,12 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { RcvWnd: 30000, }) + <-ch newEP, _, err := c.EP.Accept(nil) - switch err.(type) { - case nil, *tcpip.ErrWouldBlock: - default: + if err != nil { t.Fatalf("Accept failed: %s", err) } - if cmp.Equal(&tcpip.ErrWouldBlock{}, err) { - // Try to accept the connections in the backlog. - we, ch := waiter.NewChannelEntry(nil) - c.WQ.EventRegister(&we, waiter.ReadableEvents) - defer c.WQ.EventUnregister(&we) - - // Wait for connection to be established. - select { - case <-ch: - newEP, _, err = c.EP.Accept(nil) - if err != nil { - t.Fatalf("Accept failed: %s", err) - } - - case <-time.After(1 * time.Second): - t.Fatalf("Timed out waiting for accept") - } - } - // Now verify that the TCP socket is usable and in a connected state. data := "Don't panic" var r strings.Reader @@ -6209,12 +6418,26 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { RcvWnd: 30000, }) - time.Sleep(50 * time.Millisecond) - if got := stats.TCP.ListenOverflowSynDrop.Value(); got != want { - t.Errorf("got stats.TCP.ListenOverflowSynDrop.Value() = %d, want = %d", got, want) + checkValid := func() []error { + var errors []error + if got := stats.TCP.ListenOverflowSynDrop.Value(); got != want { + errors = append(errors, fmt.Errorf("got stats.TCP.ListenOverflowSynDrop.Value() = %d, want = %d", got, want)) + } + if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.ListenOverflowSynDrop.Value(); got != want { + errors = append(errors, fmt.Errorf("got EP stats Stats.ReceiveErrors.ListenOverflowSynDrop = %d, want = %d", got, want)) + } + return errors } - if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.ListenOverflowSynDrop.Value(); got != want { - t.Errorf("got EP stats Stats.ReceiveErrors.ListenOverflowSynDrop = %d, want = %d", got, want) + + start := time.Now() + for time.Since(start) < time.Minute && len(checkValid()) > 0 { + time.Sleep(50 * time.Millisecond) + } + for _, err := range checkValid() { + t.Error(err) + } + if t.Failed() { + t.FailNow() } we, ch := waiter.NewChannelEntry(nil) @@ -6225,15 +6448,10 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { _, _, err = c.EP.Accept(nil) if cmp.Equal(&tcpip.ErrWouldBlock{}, err) { // Wait for connection to be established. - select { - case <-ch: - _, _, err = c.EP.Accept(nil) - if err != nil { - t.Fatalf("Accept failed: %s", err) - } - - case <-time.After(1 * time.Second): - t.Fatalf("Timed out waiting for accept") + <-ch + _, _, err = c.EP.Accept(nil) + if err != nil { + t.Fatalf("Accept failed: %s", err) } } } @@ -6315,7 +6533,7 @@ func TestEndpointBindListenAcceptState(t *testing.T) { t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } - c.PassiveConnectWithOptions(100, 5, header.TCPSynOptions{MSS: defaultIPv4MSS}) + c.PassiveConnectWithOptions(100, 5, header.TCPSynOptions{MSS: defaultIPv4MSS}, 0 /* delay */) // Try to accept the connection. we, ch := waiter.NewChannelEntry(nil) @@ -6396,7 +6614,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // maximum buffer size defined above. c.WindowScale = uint8(tcp.FindWndScale(maxReceiveBufferSize)) - rawEP := c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, WS: 4}) + rawEP := c.CreateConnectedWithOptionsNoDelay(header.TCPSynOptions{TS: true, WS: 4}) // NOTE: The timestamp values in the sent packets are meaningless to the // peer so we just increment the timestamp value by 1 every batch as we @@ -6526,7 +6744,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // maximum buffer size used by stack. c.WindowScale = uint8(tcp.FindWndScale(maxReceiveBufferSize)) - rawEP := c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, WS: 4}) + rawEP := c.CreateConnectedWithOptionsNoDelay(header.TCPSynOptions{TS: true, WS: 4}) tsVal := rawEP.TSVal rawEP.NextSeqNum-- rawEP.SendPacketWithTS(nil, tsVal) @@ -7441,6 +7659,11 @@ func TestTCPUserTimeout(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() + initRTO := 1 * time.Second + minRTOOpt := tcpip.TCPMinRTOOption(initRTO) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &minRTOOpt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, minRTOOpt, minRTOOpt, err) + } c.CreateConnected(context.TestInitialSequenceNumber, 30000, -1 /* epRcvBuf */) waitEntry, notifyCh := waiter.NewChannelEntry(nil) @@ -7451,7 +7674,6 @@ func TestTCPUserTimeout(t *testing.T) { // Ensure that on the next retransmit timer fire, the user timeout has // expired. - initRTO := 1 * time.Second userTimeout := initRTO / 2 v := tcpip.TCPUserTimeoutOption(userTimeout) if err := c.EP.SetSockOpt(&v); err != nil { @@ -7483,7 +7705,7 @@ func TestTCPUserTimeout(t *testing.T) { select { case <-notifyCh: case <-time.After(2 * initRTO): - t.Fatalf("connection still alive after %s, should have been closed after :%s", 2*initRTO, userTimeout) + t.Fatalf("connection still alive after %s, should have been closed after %s", 2*initRTO, userTimeout) } // No packet should be received as the connection should be silently @@ -7717,7 +7939,7 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) { // Increasing the buffer from should generate an ACK, // since window grew from small value to larger equal MSS - c.EP.SocketOptions().SetReceiveBufferSize(rcvBuf*2, true) + c.EP.SocketOptions().SetReceiveBufferSize(rcvBuf*4, true /* notify */) checker.IPv4(t, c.GetPacket(), checker.PayloadLen(header.TCPMinimumSize), checker.TCP( @@ -7965,6 +8187,151 @@ func TestSetStackTimeWaitReuse(t *testing.T) { } } +func TestHandshakeRTT(t *testing.T) { + type testCase struct { + connect bool + tsEnabled bool + useCookie bool + retrans bool + delay time.Duration + wantRTT time.Duration + } + var testCases []testCase + for _, connect := range []bool{false, true} { + for _, tsEnabled := range []bool{false, true} { + for _, useCookie := range []bool{false, true} { + for _, retrans := range []bool{false, true} { + if connect && useCookie { + continue + } + delay := 800 * time.Millisecond + if retrans { + delay = 1200 * time.Millisecond + } + wantRTT := delay + // If syncookie is enabled, sample RTT only when TS option is enabled. + if !retrans && useCookie && !tsEnabled { + wantRTT = 0 + } + // If retransmitted, sample RTT only when TS option is enabled. + if retrans && !tsEnabled { + wantRTT = 0 + } + testCases = append(testCases, testCase{connect, tsEnabled, useCookie, retrans, delay, wantRTT}) + } + } + } + } + for _, tt := range testCases { + tt := tt + t.Run(fmt.Sprintf("connect=%t,TS=%t,cookie=%t,retrans=%t)", tt.connect, tt.tsEnabled, tt.useCookie, tt.retrans), func(t *testing.T) { + t.Parallel() + c := context.New(t, defaultMTU) + if tt.useCookie { + opt := tcpip.TCPAlwaysUseSynCookies(true) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } + } + synOpts := header.TCPSynOptions{} + if tt.tsEnabled { + synOpts.TS = true + synOpts.TSVal = 42 + } + if tt.connect { + c.CreateConnectedWithOptions(synOpts, tt.delay) + } else { + synOpts.MSS = defaultIPv4MSS + synOpts.WS = -1 + c.AcceptWithOptions(-1, synOpts, tt.delay) + } + var info tcpip.TCPInfoOption + if err := c.EP.GetSockOpt(&info); err != nil { + t.Fatalf("c.EP.GetSockOpt(&%T) = %s", info, err) + } + if got := info.RTT.Round(tt.wantRTT); got != tt.wantRTT { + t.Fatalf("got info.RTT=%s, expect %s", got, tt.wantRTT) + } + if info.RTTVar != 0 && tt.wantRTT == 0 { + t.Fatalf("got info.RTTVar=%s, expect 0", info.RTTVar) + } + if info.RTTVar == 0 && tt.wantRTT != 0 { + t.Fatalf("got info.RTTVar=0, expect non zero") + } + }) + } +} + +func TestSetRTO(t *testing.T) { + c := context.New(t, defaultMTU) + minRTO, maxRTO := tcpRTOMinMax(t, c) + for _, tt := range []struct { + name string + RTO time.Duration + minRTO time.Duration + maxRTO time.Duration + err tcpip.Error + }{ + { + name: "invalid minRTO", + minRTO: maxRTO + time.Second, + err: &tcpip.ErrInvalidOptionValue{}, + }, + { + name: "invalid maxRTO", + maxRTO: minRTO - time.Millisecond, + err: &tcpip.ErrInvalidOptionValue{}, + }, + { + name: "valid minRTO", + minRTO: maxRTO - time.Second, + }, + { + name: "valid maxRTO", + maxRTO: minRTO + time.Millisecond, + }, + } { + t.Run(tt.name, func(t *testing.T) { + c := context.New(t, defaultMTU) + var opt tcpip.SettableTransportProtocolOption + if tt.minRTO > 0 { + min := tcpip.TCPMinRTOOption(tt.minRTO) + opt = &min + } + if tt.maxRTO > 0 { + max := tcpip.TCPMaxRTOOption(tt.maxRTO) + opt = &max + } + err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt) + if got, want := err, tt.err; got != want { + t.Fatalf("c.Stack().SetTransportProtocolOption(TCP, &%T(%v)) = %v, want = %v", opt, opt, got, want) + } + if tt.err == nil { + minRTO, maxRTO := tcpRTOMinMax(t, c) + if tt.minRTO > 0 && tt.minRTO != minRTO { + t.Fatalf("got minRTO = %s, want %s", minRTO, tt.minRTO) + } + if tt.maxRTO > 0 && tt.maxRTO != maxRTO { + t.Fatalf("got maxRTO = %s, want %s", maxRTO, tt.maxRTO) + } + } + }) + } +} + +func tcpRTOMinMax(t *testing.T, c *context.Context) (time.Duration, time.Duration) { + t.Helper() + var minOpt tcpip.TCPMinRTOOption + var maxOpt tcpip.TCPMaxRTOOption + if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &minOpt); err != nil { + t.Fatalf("c.Stack().TransportProtocolOption(TCP, %T): %s", minOpt, err) + } + if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &maxOpt); err != nil { + t.Fatalf("c.Stack().TransportProtocolOption(TCP, %T): %s", maxOpt, err) + } + return time.Duration(minOpt), time.Duration(maxOpt) +} + // generateRandomPayload generates a random byte slice of the specified length // causing a fatal test failure if it is unable to do so. func generateRandomPayload(t *testing.T, n int) []byte { @@ -7975,3 +8342,192 @@ func generateRandomPayload(t *testing.T, n int) []byte { } return buf } + +func TestSendBufferTuning(t *testing.T) { + const maxPayload = 536 + const mtu = header.TCPMinimumSize + header.IPv4MinimumSize + maxTCPOptionSize + maxPayload + const packetOverheadFactor = 2 + + testCases := []struct { + name string + autoTuningDisabled bool + }{ + {"autoTuningDisabled", true}, + {"autoTuningEnabled", false}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + c := context.New(t, mtu) + defer c.Cleanup() + + // Set the stack option for send buffer size. + const defaultSndBufSz = maxPayload * tcp.InitialCwnd + const maxSndBufSz = defaultSndBufSz * 10 + { + opt := tcpip.TCPSendBufferSizeRangeOption{Min: 1, Default: defaultSndBufSz, Max: maxSndBufSz} + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err) + } + } + + c.CreateConnected(context.TestInitialSequenceNumber, 30000, -1 /* epRcvBuf */) + + oldSz := c.EP.SocketOptions().GetSendBufferSize() + if oldSz != defaultSndBufSz { + t.Fatalf("Wrong send buffer size got %d want %d", oldSz, defaultSndBufSz) + } + + if tc.autoTuningDisabled { + c.EP.SocketOptions().SetSendBufferSize(defaultSndBufSz, true /* notify */) + } + + data := make([]byte, maxPayload) + for i := range data { + data[i] = byte(i) + } + + w, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&w, waiter.WritableEvents) + defer c.WQ.EventUnregister(&w) + + bytesRead := 0 + for { + // Packets will be sent till the send buffer + // size is reached. + var r bytes.Reader + r.Reset(data[bytesRead : bytesRead+maxPayload]) + _, err := c.EP.Write(&r, tcpip.WriteOptions{}) + if cmp.Equal(&tcpip.ErrWouldBlock{}, err) { + break + } + + c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, 0) + bytesRead += maxPayload + data = append(data, data...) + } + + // Send an ACK and wait for connection to become writable again. + c.SendAck(seqnum.Value(context.TestInitialSequenceNumber).Add(1), bytesRead) + select { + case <-ch: + if err := c.EP.LastError(); err != nil { + t.Fatalf("Write failed: %s", err) + } + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for connection") + } + + outSz := int64(defaultSndBufSz) + if !tc.autoTuningDisabled { + // Calculate the new auto tuned send buffer. + var info tcpip.TCPInfoOption + if err := c.EP.GetSockOpt(&info); err != nil { + t.Fatalf("GetSockOpt failed: %v", err) + } + outSz = int64(info.SndCwnd) * packetOverheadFactor * maxPayload + } + + if newSz := c.EP.SocketOptions().GetSendBufferSize(); newSz != outSz { + t.Fatalf("Wrong send buffer size, got %d want %d", newSz, outSz) + } + }) + } +} + +func TestTimestampSynCookies(t *testing.T) { + clock := faketime.NewManualClock() + tsNow := func() uint32 { + return uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Milliseconds()) + } + // Advance the clock so that NowMonotonic is non-zero. + clock.Advance(time.Second) + c := context.NewWithOpts(t, context.Options{ + EnableV4: true, + EnableV6: true, + MTU: defaultMTU, + Clock: clock, + }) + defer c.Cleanup() + opt := tcpip.TCPAlwaysUseSynCookies(true) + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } + wq := &waiter.Queue{} + ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) + if err != nil { + t.Fatalf("NewEndpoint failed: %s", err) + } + defer ep.Close() + + tcpOpts := [12]byte{header.TCPOptionNOP, header.TCPOptionNOP} + header.EncodeTSOption(42, 0, tcpOpts[2:]) + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatalf("Bind failed: %s", err) + } + if err := ep.Listen(10); err != nil { + t.Fatalf("Listen failed: %s", err) + } + iss := seqnum.Value(context.TestInitialSequenceNumber) + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + RcvWnd: seqnum.Size(512), + SeqNum: iss, + TCPOpts: tcpOpts[:], + }) + // Get the TSVal of SYN-ACK. + b := c.GetPacket() + tcpHdr := header.TCP(header.IPv4(b).Payload()) + c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) + initialTSVal := tcpHdr.ParsedOptions().TSVal + // derive the tsOffset. + tsOffset := initialTSVal - tsNow() + + header.EncodeTSOption(420, initialTSVal, tcpOpts[2:]) + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + RcvWnd: seqnum.Size(512), + SeqNum: iss + 1, + AckNum: c.IRS + 1, + TCPOpts: tcpOpts[:], + }) + c.EP, _, err = ep.Accept(nil) + // Try to accept the connection. + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.ReadableEvents) + defer wq.EventUnregister(&we) + if cmp.Equal(&tcpip.ErrWouldBlock{}, err) { + // Wait for connection to be established. + select { + case <-ch: + c.EP, _, err = ep.Accept(nil) + if err != nil { + t.Fatalf("Accept failed: %s", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } else if err != nil { + t.Fatalf("failed to accept: %s", err) + } + + // Advance the clock again so that we expect the next TSVal to change. + clock.Advance(time.Second) + data := []byte{1, 2, 3} + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + + // The endpoint should have a correct TSOffset so that the received TSVal + // should match our expectation. + if got, want := header.TCP(header.IPv4(c.GetPacket()).Payload()).ParsedOptions().TSVal, tsNow()+tsOffset; got != want { + t.Fatalf("got TSVal = %d, want %d", got, want) + } +} diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index 1deb1fe4d..65925daa5 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -32,7 +32,7 @@ import ( // createConnectedWithTimestampOption creates and connects c.ep with the // timestamp option enabled. func createConnectedWithTimestampOption(c *context.Context) *context.RawEndpoint { - return c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, TSVal: 1}) + return c.CreateConnectedWithOptionsNoDelay(header.TCPSynOptions{TS: true, TSVal: 1}) } // TestTimeStampEnabledConnect tests that netstack sends the timestamp option on @@ -131,7 +131,7 @@ func TestTimeStampDisabledConnect(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() - c.CreateConnectedWithOptions(header.TCPSynOptions{}) + c.CreateConnectedWithOptionsNoDelay(header.TCPSynOptions{}) } func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndSize uint16) { @@ -147,7 +147,7 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS t.Logf("Test w/ CookieEnabled = %v", cookieEnabled) tsVal := rand.Uint32() - c.AcceptWithOptions(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, TS: true, TSVal: tsVal}) + c.AcceptWithOptionsNoDelay(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, TS: true, TSVal: tsVal}) // Now send some data and validate that timestamp is echoed correctly in the ACK. data := []byte{1, 2, 3} @@ -209,7 +209,7 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd } t.Logf("Test w/ CookieEnabled = %v", cookieEnabled) - c.AcceptWithOptions(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS}) + c.AcceptWithOptionsNoDelay(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS}) // Now send some data with the accepted connection endpoint and validate // that no timestamp option is sent in the TCP segment. diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 53efecc5a..88bb99354 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -122,6 +122,9 @@ type Options struct { // MTU indicates the maximum transmission unit on the link layer. MTU uint32 + + // Clock that is used by Stack. + Clock tcpip.Clock } // Context provides an initialized Network stack and a link layer endpoint @@ -182,6 +185,7 @@ func NewWithOpts(t *testing.T, opts Options) *Context { stackOpts := stack.Options{ TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}, + Clock: opts.Clock, } if opts.EnableV4 { stackOpts.NetworkProtocols = append(stackOpts.NetworkProtocols, ipv4.NewProtocol) @@ -239,8 +243,8 @@ func NewWithOpts(t *testing.T, opts Options) *Context { Protocol: ipv4.ProtocolNumber, AddressWithPrefix: StackAddrWithPrefix, } - if err := s.AddProtocolAddress(1, v4ProtocolAddr); err != nil { - t.Fatalf("AddProtocolAddress(1, %#v): %s", v4ProtocolAddr, err) + if err := s.AddProtocolAddress(1, v4ProtocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(1, %+v, {}): %s", v4ProtocolAddr, err) } routeTable = append(routeTable, tcpip.Route{ Destination: header.IPv4EmptySubnet, @@ -253,8 +257,8 @@ func NewWithOpts(t *testing.T, opts Options) *Context { Protocol: ipv6.ProtocolNumber, AddressWithPrefix: StackV6AddrWithPrefix, } - if err := s.AddProtocolAddress(1, v6ProtocolAddr); err != nil { - t.Fatalf("AddProtocolAddress(1, %#v): %s", v6ProtocolAddr, err) + if err := s.AddProtocolAddress(1, v6ProtocolAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(1, %+v, {}): %s", v6ProtocolAddr, err) } routeTable = append(routeTable, tcpip.Route{ Destination: header.IPv6EmptySubnet, @@ -757,7 +761,7 @@ func (c *Context) Create(epRcvBuf int) { } if epRcvBuf != -1 { - c.EP.SocketOptions().SetReceiveBufferSize(int64(epRcvBuf), true /* notify */) + c.EP.SocketOptions().SetReceiveBufferSize(int64(epRcvBuf)*2, true /* notify */) } } @@ -879,13 +883,21 @@ func (r *RawEndpoint) VerifyACKHasSACK(sackBlocks []header.SACKBlock) { ) } +// CreateConnectedWithOptionsNoDelay just calls CreateConnectedWithOptions +// without delay. +func (c *Context) CreateConnectedWithOptionsNoDelay(wantOptions header.TCPSynOptions) *RawEndpoint { + return c.CreateConnectedWithOptions(wantOptions, 0 /* delay */) +} + // CreateConnectedWithOptions creates and connects c.ep with the specified TCP // options enabled and returns a RawEndpoint which represents the other end of -// the connection. +// the connection. It delays before a SYNACK is sent. This makes c.EP have a +// higher RTT estimate so that spurious TLPs aren't sent in tests, which helps +// reduce flakiness. // // It also verifies where required(eg.Timestamp) that the ACK to the SYN-ACK // does not carry an option that was not requested. -func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *RawEndpoint { +func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions, delay time.Duration) *RawEndpoint { var err tcpip.Error c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { @@ -911,18 +923,17 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * // TS value. mss := uint16(c.linkEP.MTU() - header.IPv4MinimumSize - header.TCPMinimumSize) - checker.IPv4(c.t, b, - checker.TCP( - checker.DstPort(TestPort), - checker.TCPFlags(header.TCPFlagSyn), - checker.TCPSynOptions(header.TCPSynOptions{ - MSS: mss, - TS: true, - WS: int(c.WindowScale), - SACKPermitted: c.SACKEnabled(), - }), - ), + synChecker := checker.TCP( + checker.DstPort(TestPort), + checker.TCPFlags(header.TCPFlagSyn), + checker.TCPSynOptions(header.TCPSynOptions{ + MSS: mss, + TS: true, + WS: int(c.WindowScale), + SACKPermitted: c.SACKEnabled(), + }), ) + checker.IPv4(c.t, b, synChecker) if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want { c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got) } @@ -948,6 +959,10 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * // Build SYN-ACK. c.IRS = seqnum.Value(tcpSeg.SequenceNumber()) iss := seqnum.Value(TestInitialSequenceNumber) + if delay > 0 { + // Sleep so that RTT is increased. + time.Sleep(delay) + } c.SendPacket(nil, &Headers{ SrcPort: tcpSeg.DestinationPort(), DstPort: tcpSeg.SourcePort(), @@ -959,7 +974,17 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * }) // Read ACK. - ackPacket := c.GetPacket() + var ackPacket []byte + // Ignore retransimitted SYN packets. + for { + packet := c.GetPacket() + if header.TCP(header.IPv4(packet).Payload()).Flags()&header.TCPFlagSyn != 0 { + checker.IPv4(c.t, packet, synChecker) + } else { + ackPacket = packet + break + } + } // Verify TCP header fields. tcpCheckers := []checker.TransportChecker{ @@ -1016,13 +1041,19 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * } } -// AcceptWithOptions initializes a listening endpoint and connects to it with the -// provided options enabled. It also verifies that the SYN-ACK has the expected -// values for the provided options. +// AcceptWithOptionsNoDelay delegates call to AcceptWithOptions without delay. +func (c *Context) AcceptWithOptionsNoDelay(wndScale int, synOptions header.TCPSynOptions) *RawEndpoint { + return c.AcceptWithOptions(wndScale, synOptions, 0 /* delay */) +} + +// AcceptWithOptions initializes a listening endpoint and connects to it with +// the provided options enabled. It delays before the final ACK of the 3WHS is +// sent. It also verifies that the SYN-ACK has the expected values for the +// provided options. // // The function returns a RawEndpoint representing the other end of the accepted // endpoint. -func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOptions) *RawEndpoint { +func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOptions, delay time.Duration) *RawEndpoint { // Create EP and start listening. wq := &waiter.Queue{} ep, err := c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) @@ -1045,7 +1076,7 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) } - rep := c.PassiveConnectWithOptions(100, wndScale, synOptions) + rep := c.PassiveConnectWithOptions(100, wndScale, synOptions, delay) // Try to accept the connection. we, ch := waiter.NewChannelEntry(nil) @@ -1077,13 +1108,14 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption // PassiveConnectWithOptions. func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCPSynOptions) { synOptions.WS = -1 - c.PassiveConnectWithOptions(maxPayload, wndScale, synOptions) + c.PassiveConnectWithOptions(maxPayload, wndScale, synOptions, 0 /* delay */) } // PassiveConnectWithOptions initiates a new connection (with the specified TCP // options enabled) to the port on which the Context.ep is listening for new // connections. It also validates that the SYN-ACK has the expected values for -// the enabled options. +// the enabled options. The final ACK of the handshake is delayed by specified +// duration. // // NOTE: MSS is not a negotiated option and it can be asymmetric // in each direction. This function uses the maxPayload to set the MSS to be @@ -1093,7 +1125,7 @@ func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCP // wndScale is the expected window scale in the SYN-ACK and synOptions.WS is the // value of the window scaling option to be sent in the SYN. If synOptions.WS > // 0 then we send the WindowScale option. -func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint { +func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions, delay time.Duration) *RawEndpoint { c.t.Helper() opts := make([]byte, header.TCPOptionsMaximumSize) offset := 0 @@ -1180,7 +1212,10 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions ackHeaders.TCPOpts = opts[:] } - // Send ACK. + // Send ACK, delay if needed. + if delay > 0 { + time.Sleep(delay) + } c.SendPacket(nil, ackHeaders) c.RcvdWindowScale = uint8(rcvdSynOptions.WS) diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/tcpip/transport/transport.go index c96002e19..4c2ae87f4 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go +++ b/pkg/tcpip/transport/transport.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,19 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. -package disklayout - -import ( - "testing" -) - -// TestExtentSize tests that the extent structs are of the correct -// size. -func TestExtentSize(t *testing.T) { - var h ExtentHeader - assertSize(t, &h, ExtentHeaderSize) - var i ExtentIdx - assertSize(t, &i, ExtentEntrySize) - var e Extent - assertSize(t, &e, ExtentEntrySize) -} +// Package transport supports transport protocols. +package transport diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index cdc344ab7..5cc7a2886 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -35,6 +35,8 @@ go_library( "//pkg/tcpip/header/parse", "//pkg/tcpip/ports", "//pkg/tcpip/stack", + "//pkg/tcpip/transport", + "//pkg/tcpip/transport/internal/network", "//pkg/tcpip/transport/raw", "//pkg/waiter", ], diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index def9d7186..4255457f9 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -15,8 +15,8 @@ package udp import ( + "fmt" "io" - "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sync" @@ -25,12 +25,15 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport" + "gvisor.dev/gvisor/pkg/tcpip/transport/internal/network" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type udpPacket struct { udpPacketEntry + netProto tcpip.NetworkProtocolNumber senderAddress tcpip.FullAddress destinationAddress tcpip.FullAddress packetInfo tcpip.IPPacketInfo @@ -40,36 +43,6 @@ type udpPacket struct { tos uint8 } -// EndpointState represents the state of a UDP endpoint. -type EndpointState tcpip.EndpointState - -// Endpoint states. Note that are represented in a netstack-specific manner and -// may not be meaningful externally. Specifically, they need to be translated to -// Linux's representation for these states if presented to userspace. -const ( - _ EndpointState = iota - StateInitial - StateBound - StateConnected - StateClosed -) - -// String implements fmt.Stringer. -func (s EndpointState) String() string { - switch s { - case StateInitial: - return "INITIAL" - case StateBound: - return "BOUND" - case StateConnected: - return "CONNECTING" - case StateClosed: - return "CLOSED" - default: - return "UNKNOWN" - } -} - // endpoint represents a UDP endpoint. This struct serves as the interface // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly @@ -79,7 +52,6 @@ func (s EndpointState) String() string { // // +stateify savable type endpoint struct { - stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and do not @@ -87,6 +59,10 @@ type endpoint struct { stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue uniqueID uint64 + net network.Endpoint + // TODO(b/142022063): Add ability to save and restore per endpoint stats. + stats tcpip.TransportEndpointStats `state:"nosave"` + ops tcpip.SocketOptions // The following fields are used to manage the receive queue, and are // protected by rcvMu. @@ -96,37 +72,19 @@ type endpoint struct { rcvBufSize int rcvClosed bool - // The following fields are protected by the mu mutex. - mu sync.RWMutex `state:"nosave"` - // state must be read/set using the EndpointState()/setEndpointState() - // methods. - state uint32 - route *stack.Route `state:"manual"` - dstPort uint16 - ttl uint8 - multicastTTL uint8 - multicastAddr tcpip.Address - multicastNICID tcpip.NICID - portFlags ports.Flags - lastErrorMu sync.Mutex `state:"nosave"` lastError tcpip.Error + // The following fields are protected by the mu mutex. + mu sync.RWMutex `state:"nosave"` + portFlags ports.Flags + // Values used to reserve a port or register a transport endpoint. // (which ever happens first). boundBindToDevice tcpip.NICID boundPortFlags ports.Flags - // sendTOS represents IPv4 TOS or IPv6 TrafficClass, - // applied while sending packets. Defaults to 0 as on Linux. - sendTOS uint8 - - // shutdownFlags represent the current shutdown state of the endpoint. - shutdownFlags tcpip.ShutdownFlags - - // multicastMemberships that need to be remvoed when the endpoint is - // closed. Protected by the mu mutex. - multicastMemberships map[multicastMembership]struct{} + readShutdown bool // effectiveNetProtos contains the network protocols actually in use. In // most cases it will only contain "netProto", but in cases like IPv6 @@ -136,55 +94,25 @@ type endpoint struct { // address). effectiveNetProtos []tcpip.NetworkProtocolNumber - // TODO(b/142022063): Add ability to save and restore per endpoint stats. - stats tcpip.TransportEndpointStats `state:"nosave"` - - // owner is used to get uid and gid of the packet. - owner tcpip.PacketOwner - - // ops is used to get socket level options. - ops tcpip.SocketOptions - // frozen indicates if the packets should be delivered to the endpoint // during restore. frozen bool -} -// +stateify savable -type multicastMembership struct { - nicID tcpip.NICID - multicastAddr tcpip.Address + localPort uint16 + remotePort uint16 } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { e := &endpoint{ - stack: s, - TransportEndpointInfo: stack.TransportEndpointInfo{ - NetProto: netProto, - TransProto: header.UDPProtocolNumber, - }, + stack: s, waiterQueue: waiterQueue, - // RFC 1075 section 5.4 recommends a TTL of 1 for membership - // requests. - // - // RFC 5135 4.2.1 appears to assume that IGMP messages have a - // TTL of 1. - // - // RFC 5135 Appendix A defines TTL=1: A multicast source that - // wants its traffic to not traverse a router (e.g., leave a - // home network) may find it useful to send traffic with IP - // TTL=1. - // - // Linux defaults to TTL=1. - multicastTTL: 1, - multicastMemberships: make(map[multicastMembership]struct{}), - state: uint32(StateInitial), - uniqueID: s.UniqueID(), + uniqueID: s.UniqueID(), } e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) e.ops.SetMulticastLoop(true) e.ops.SetSendBufferSize(32*1024, false /* notify */) e.ops.SetReceiveBufferSize(32*1024, false /* notify */) + e.net.Init(s, netProto, header.UDPProtocolNumber, &e.ops) // Override with stack defaults. var ss tcpip.SendBufferSizeOption @@ -200,20 +128,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue return e } -// setEndpointState updates the state of the endpoint to state atomically. This -// method is unexported as the only place we should update the state is in this -// package but we allow the state to be read freely without holding e.mu. -// -// Precondition: e.mu must be held to call this method. -func (e *endpoint) setEndpointState(state EndpointState) { - atomic.StoreUint32(&e.state, uint32(state)) -} - -// EndpointState() returns the current state of the endpoint. -func (e *endpoint) EndpointState() EndpointState { - return EndpointState(atomic.LoadUint32(&e.state)) -} - // UniqueID implements stack.TransportEndpoint. func (e *endpoint) UniqueID() uint64 { return e.uniqueID @@ -244,16 +158,22 @@ func (e *endpoint) Abort() { // associated with it. func (e *endpoint) Close() { e.mu.Lock() - e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite - switch e.EndpointState() { - case StateBound, StateConnected: - e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) + switch state := e.net.State(); state { + case transport.DatagramEndpointStateInitial: + case transport.DatagramEndpointStateClosed: + e.mu.Unlock() + return + case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: + id := e.net.Info().ID + id.LocalPort = e.localPort + id.RemotePort = e.remotePort + e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, id, e, e.boundPortFlags, e.boundBindToDevice) portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, - Addr: e.ID.LocalAddress, - Port: e.ID.LocalPort, + Addr: id.LocalAddress, + Port: id.LocalPort, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: tcpip.FullAddress{}, @@ -261,13 +181,10 @@ func (e *endpoint) Close() { e.stack.ReleasePort(portRes) e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} + default: + panic(fmt.Sprintf("unhandled state = %s", state)) } - for mem := range e.multicastMemberships { - e.stack.LeaveGroup(e.NetProto, mem.nicID, mem.multicastAddr) - } - e.multicastMemberships = make(map[multicastMembership]struct{}) - // Close the receive list and drain it. e.rcvMu.Lock() e.rcvClosed = true @@ -278,14 +195,9 @@ func (e *endpoint) Close() { } e.rcvMu.Unlock() - if e.route != nil { - e.route.Release() - e.route = nil - } - - // Update the state. - e.setEndpointState(StateClosed) - + e.net.Shutdown() + e.net.Close() + e.readShutdown = true e.mu.Unlock() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) @@ -324,14 +236,21 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult HasTimestamp: true, Timestamp: p.receivedAt.UnixNano(), } - if e.ops.GetReceiveTOS() { - cm.HasTOS = true - cm.TOS = p.tos - } - if e.ops.GetReceiveTClass() { - cm.HasTClass = true - // Although TClass is an 8-bit value it's read in the CMsg as a uint32. - cm.TClass = uint32(p.tos) + + switch p.netProto { + case header.IPv4ProtocolNumber: + if e.ops.GetReceiveTOS() { + cm.HasTOS = true + cm.TOS = p.tos + } + case header.IPv6ProtocolNumber: + if e.ops.GetReceiveTClass() { + cm.HasTClass = true + // Although TClass is an 8-bit value it's read in the CMsg as a uint32. + cm.TClass = uint32(p.tos) + } + default: + panic(fmt.Sprintf("unrecognized network protocol = %d", p.netProto)) } if e.ops.GetReceivePacketInfo() { cm.HasIPPacketInfo = true @@ -359,18 +278,19 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult return res, nil } -// prepareForWrite prepares the endpoint for sending data. In particular, it -// binds it if it's still in the initial state. To do so, it must first +// prepareForWriteInner prepares the endpoint for sending data. In particular, +// it binds it if it's still in the initial state. To do so, it must first // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. -func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { - switch e.EndpointState() { - case StateInitial: - case StateConnected: +// +checklocks:e.mu +func (e *endpoint) prepareForWriteInner(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { + switch e.net.State() { + case transport.DatagramEndpointStateInitial: + case transport.DatagramEndpointStateConnected: return false, nil - case StateBound: + case transport.DatagramEndpointStateBound: if to == nil { return false, &tcpip.ErrDestinationRequired{} } @@ -380,14 +300,12 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip } e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() + defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. - if e.EndpointState() != StateInitial { + if e.net.State() != transport.DatagramEndpointStateInitial { return true, nil } @@ -399,33 +317,6 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip return true, nil } -// connectRoute establishes a route to the specified interface or the -// configured multicast interface if no interface is specified and the -// specified address is a multicast address. -func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, tcpip.Error) { - localAddr := e.ID.LocalAddress - if e.isBroadcastOrMulticast(nicID, netProto, localAddr) { - // A packet can only originate from a unicast address (i.e., an interface). - localAddr = "" - } - - if header.IsV4MulticastAddress(addr.Addr) || header.IsV6MulticastAddress(addr.Addr) { - if nicID == 0 { - nicID = e.multicastNICID - } - if localAddr == "" && nicID == 0 { - localAddr = e.multicastAddr - } - } - - // Find a route to the desired destination. - r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop()) - if err != nil { - return nil, 0, err - } - return r, nicID, nil -} - // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { @@ -449,37 +340,15 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp return n, err } -func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { - if err := e.LastError(); err != nil { - return 0, err - } - - // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) - if opts.More { - return 0, &tcpip.ErrInvalidOptionValue{} - } - - to := opts.To - +func (e *endpoint) prepareForWrite(p tcpip.Payloader, opts tcpip.WriteOptions) (udpPacketInfo, tcpip.Error) { e.mu.RLock() - lockReleased := false - defer func() { - if lockReleased { - return - } - e.mu.RUnlock() - }() - - // If we've shutdown with SHUT_WR we are in an invalid state for sending. - if e.shutdownFlags&tcpip.ShutdownWrite != 0 { - return 0, &tcpip.ErrClosedForSend{} - } + defer e.mu.RUnlock() // Prepare for write. for { - retry, err := e.prepareForWrite(to) + retry, err := e.prepareForWriteInner(opts.To) if err != nil { - return 0, err + return udpPacketInfo{}, err } if !retry { @@ -487,50 +356,29 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp } } - route := e.route - dstPort := e.dstPort - if to != nil { - // Reject destination address if it goes through a different - // NIC than the endpoint was bound to. - nicID := to.NIC - if nicID == 0 { - nicID = tcpip.NICID(e.ops.GetBindToDevice()) - } - if e.BindNICID != 0 { - if nicID != 0 && nicID != e.BindNICID { - return 0, &tcpip.ErrNoRoute{} - } - - nicID = e.BindNICID - } - - if to.Port == 0 { + dst, connected := e.net.GetRemoteAddress() + dst.Port = e.remotePort + if opts.To != nil { + if opts.To.Port == 0 { // Port 0 is an invalid port to send to. - return 0, &tcpip.ErrInvalidEndpointState{} - } - - dst, netProto, err := e.checkV4MappedLocked(*to) - if err != nil { - return 0, err + return udpPacketInfo{}, &tcpip.ErrInvalidEndpointState{} } - r, _, err := e.connectRoute(nicID, dst, netProto) - if err != nil { - return 0, err - } - defer r.Release() - - route = r - dstPort = dst.Port + dst = *opts.To + } else if !connected { + return udpPacketInfo{}, &tcpip.ErrDestinationRequired{} } - if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { - return 0, &tcpip.ErrBroadcastDisabled{} + ctx, err := e.net.AcquireContextForWrite(opts) + if err != nil { + return udpPacketInfo{}, err } + // TODO(https://gvisor.dev/issue/6538): Avoid this allocation. v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { - return 0, &tcpip.ErrBadBuffer{} + ctx.Release() + return udpPacketInfo{}, &tcpip.ErrBadBuffer{} } if len(v) > header.UDPMaximumPacketSize { // Payload can't possibly fit in a packet. @@ -538,35 +386,25 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp if so.GetRecvError() { so.QueueLocalErr( &tcpip.ErrMessageTooLong{}, - route.NetProto(), + e.net.NetProto(), header.UDPMaximumPacketSize, - tcpip.FullAddress{ - NIC: route.NICID(), - Addr: route.RemoteAddress(), - Port: dstPort, - }, + dst, v, ) } - return 0, &tcpip.ErrMessageTooLong{} + ctx.Release() + return udpPacketInfo{}, &tcpip.ErrMessageTooLong{} } - ttl := e.ttl - useDefaultTTL := ttl == 0 - - if header.IsV4MulticastAddress(route.RemoteAddress()) || header.IsV6MulticastAddress(route.RemoteAddress()) { - ttl = e.multicastTTL - // Multicast allows a 0 TTL. - useDefaultTTL = false - } - - localPort := e.ID.LocalPort - sendTOS := e.sendTOS - owner := e.owner - noChecksum := e.SocketOptions().GetNoChecksum() - lockReleased = true - e.mu.RUnlock() + return udpPacketInfo{ + ctx: ctx, + data: v, + localPort: e.localPort, + remotePort: dst.Port, + }, nil +} +func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { // Do not hold lock when sending as loopback is synchronous and if the UDP // datagram ends up generating an ICMP response then it can result in a // deadlock where the ICMP response handling ends up acquiring this endpoint's @@ -577,10 +415,53 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcp // // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read // locking is prohibited. - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil { + + if err := e.LastError(); err != nil { return 0, err } - return int64(len(v)), nil + + udpInfo, err := e.prepareForWrite(p, opts) + if err != nil { + return 0, err + } + defer udpInfo.ctx.Release() + + pktInfo := udpInfo.ctx.PacketInfo() + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(pktInfo.MaxHeaderLength), + Data: udpInfo.data.ToVectorisedView(), + }) + + // Initialize the UDP header. + udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) + pkt.TransportProtocolNumber = ProtocolNumber + + length := uint16(pkt.Size()) + udp.Encode(&header.UDPFields{ + SrcPort: udpInfo.localPort, + DstPort: udpInfo.remotePort, + Length: length, + }) + + // Set the checksum field unless TX checksum offload is enabled. + // On IPv4, UDP checksum is optional, and a zero value indicates the + // transmitter skipped the checksum generation (RFC768). + // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). + if pktInfo.RequiresTXTransportChecksum && + (!e.ops.GetNoChecksum() || pktInfo.NetProto == header.IPv6ProtocolNumber) { + udp.SetChecksum(^udp.CalculateChecksum(header.ChecksumCombine( + header.PseudoHeaderChecksum(ProtocolNumber, pktInfo.LocalAddress, pktInfo.RemoteAddress, length), + pkt.Data().AsRange().Checksum(), + ))) + } + if err := udpInfo.ctx.WritePacket(pkt, false /* headerIncluded */); err != nil { + e.stack.Stats().UDP.PacketSendErrors.Increment() + return 0, err + } + + // Track count of packets sent. + e.stack.Stats().UDP.PacketsSent.Increment() + return int64(len(udpInfo.data)), nil } // OnReuseAddressSet implements tcpip.SocketOptionsHandler. @@ -599,36 +480,7 @@ func (e *endpoint) OnReusePortSet(v bool) { // SetSockOptInt implements tcpip.Endpoint. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { - switch opt { - case tcpip.MTUDiscoverOption: - // Return not supported if the value is not disabling path - // MTU discovery. - if v != tcpip.PMTUDiscoveryDont { - return &tcpip.ErrNotSupported{} - } - - case tcpip.MulticastTTLOption: - e.mu.Lock() - e.multicastTTL = uint8(v) - e.mu.Unlock() - - case tcpip.TTLOption: - e.mu.Lock() - e.ttl = uint8(v) - e.mu.Unlock() - - case tcpip.IPv4TOSOption: - e.mu.Lock() - e.sendTOS = uint8(v) - e.mu.Unlock() - - case tcpip.IPv6TrafficClassOption: - e.mu.Lock() - e.sendTOS = uint8(v) - e.mu.Unlock() - } - - return nil + return e.net.SetSockOptInt(opt, v) } var _ tcpip.SocketOptionsHandler = (*endpoint)(nil) @@ -640,145 +492,12 @@ func (e *endpoint) HasNIC(id int32) bool { // SetSockOpt implements tcpip.Endpoint. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { - switch v := opt.(type) { - case *tcpip.MulticastInterfaceOption: - e.mu.Lock() - defer e.mu.Unlock() - - fa := tcpip.FullAddress{Addr: v.InterfaceAddr} - fa, netProto, err := e.checkV4MappedLocked(fa) - if err != nil { - return err - } - nic := v.NIC - addr := fa.Addr - - if nic == 0 && addr == "" { - e.multicastAddr = "" - e.multicastNICID = 0 - break - } - - if nic != 0 { - if !e.stack.CheckNIC(nic) { - return &tcpip.ErrBadLocalAddress{} - } - } else { - nic = e.stack.CheckLocalAddress(0, netProto, addr) - if nic == 0 { - return &tcpip.ErrBadLocalAddress{} - } - } - - if e.BindNICID != 0 && e.BindNICID != nic { - return &tcpip.ErrInvalidEndpointState{} - } - - e.multicastNICID = nic - e.multicastAddr = addr - - case *tcpip.AddMembershipOption: - if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { - return &tcpip.ErrInvalidOptionValue{} - } - - nicID := v.NIC - - if v.InterfaceAddr.Unspecified() { - if nicID == 0 { - if r, err := e.stack.FindRoute(0, "", v.MulticastAddr, e.NetProto, false /* multicastLoop */); err == nil { - nicID = r.NICID() - r.Release() - } - } - } else { - nicID = e.stack.CheckLocalAddress(nicID, e.NetProto, v.InterfaceAddr) - } - if nicID == 0 { - return &tcpip.ErrUnknownDevice{} - } - - memToInsert := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} - - e.mu.Lock() - defer e.mu.Unlock() - - if _, ok := e.multicastMemberships[memToInsert]; ok { - return &tcpip.ErrPortInUse{} - } - - if err := e.stack.JoinGroup(e.NetProto, nicID, v.MulticastAddr); err != nil { - return err - } - - e.multicastMemberships[memToInsert] = struct{}{} - - case *tcpip.RemoveMembershipOption: - if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { - return &tcpip.ErrInvalidOptionValue{} - } - - nicID := v.NIC - if v.InterfaceAddr.Unspecified() { - if nicID == 0 { - if r, err := e.stack.FindRoute(0, "", v.MulticastAddr, e.NetProto, false /* multicastLoop */); err == nil { - nicID = r.NICID() - r.Release() - } - } - } else { - nicID = e.stack.CheckLocalAddress(nicID, e.NetProto, v.InterfaceAddr) - } - if nicID == 0 { - return &tcpip.ErrUnknownDevice{} - } - - memToRemove := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} - - e.mu.Lock() - defer e.mu.Unlock() - - if _, ok := e.multicastMemberships[memToRemove]; !ok { - return &tcpip.ErrBadLocalAddress{} - } - - if err := e.stack.LeaveGroup(e.NetProto, nicID, v.MulticastAddr); err != nil { - return err - } - - delete(e.multicastMemberships, memToRemove) - - case *tcpip.SocketDetachFilterOption: - return nil - } - return nil + return e.net.SetSockOpt(opt) } // GetSockOptInt implements tcpip.Endpoint. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { - case tcpip.IPv4TOSOption: - e.mu.RLock() - v := int(e.sendTOS) - e.mu.RUnlock() - return v, nil - - case tcpip.IPv6TrafficClassOption: - e.mu.RLock() - v := int(e.sendTOS) - e.mu.RUnlock() - return v, nil - - case tcpip.MTUDiscoverOption: - // The only supported setting is path MTU discovery disabled. - return tcpip.PMTUDiscoveryDont, nil - - case tcpip.MulticastTTLOption: - e.mu.Lock() - v := int(e.multicastTTL) - e.mu.Unlock() - return v, nil - case tcpip.ReceiveQueueSizeOption: v := 0 e.rcvMu.Lock() @@ -789,92 +508,22 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { e.rcvMu.Unlock() return v, nil - case tcpip.TTLOption: - e.mu.Lock() - v := int(e.ttl) - e.mu.Unlock() - return v, nil - default: - return -1, &tcpip.ErrUnknownProtocolOption{} + return e.net.GetSockOptInt(opt) } } // GetSockOpt implements tcpip.Endpoint. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { - switch o := opt.(type) { - case *tcpip.MulticastInterfaceOption: - e.mu.Lock() - *o = tcpip.MulticastInterfaceOption{ - NIC: e.multicastNICID, - InterfaceAddr: e.multicastAddr, - } - e.mu.Unlock() - - default: - return &tcpip.ErrUnknownProtocolOption{} - } - return nil -} - -// sendUDP sends a UDP segment via the provided network endpoint and under the -// provided identity. -func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner, noChecksum bool) tcpip.Error { - pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ - ReserveHeaderBytes: header.UDPMinimumSize + int(r.MaxHeaderLength()), - Data: data, - }) - pkt.Owner = owner - - // Initialize the UDP header. - udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) - pkt.TransportProtocolNumber = ProtocolNumber - - length := uint16(pkt.Size()) - udp.Encode(&header.UDPFields{ - SrcPort: localPort, - DstPort: remotePort, - Length: length, - }) - - // Set the checksum field unless TX checksum offload is enabled. - // On IPv4, UDP checksum is optional, and a zero value indicates the - // transmitter skipped the checksum generation (RFC768). - // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). - if r.RequiresTXTransportChecksum() && - (!noChecksum || r.NetProto() == header.IPv6ProtocolNumber) { - xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) - for _, v := range data.Views() { - xsum = header.Checksum(v, xsum) - } - udp.SetChecksum(^udp.CalculateChecksum(xsum)) - } - - if useDefaultTTL { - ttl = r.DefaultTTL() - } - if err := r.WritePacket(stack.NetworkHeaderParams{ - Protocol: ProtocolNumber, - TTL: ttl, - TOS: tos, - }, pkt); err != nil { - r.Stats().UDP.PacketSendErrors.Increment() - return err - } - - // Track count of packets sent. - r.Stats().UDP.PacketsSent.Increment() - return nil + return e.net.GetSockOpt(opt) } -// checkV4MappedLocked determines the effective network protocol and converts -// addr to its canonical form. -func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { - unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) - if err != nil { - return tcpip.FullAddress{}, 0, err - } - return unwrapped, netProto, nil +// udpPacketInfo holds information needed to send a UDP packet. +type udpPacketInfo struct { + ctx network.WriteContext + data buffer.View + localPort uint16 + remotePort uint16 } // Disconnect implements tcpip.Endpoint. @@ -882,7 +531,7 @@ func (e *endpoint) Disconnect() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - if e.EndpointState() != StateConnected { + if e.net.State() != transport.DatagramEndpointStateConnected { return nil } var ( @@ -895,26 +544,28 @@ func (e *endpoint) Disconnect() tcpip.Error { boundPortFlags := e.boundPortFlags // Exclude ephemerally bound endpoints. - if e.BindNICID != 0 || e.ID.LocalAddress == "" { + info := e.net.Info() + info.ID.LocalPort = e.localPort + info.ID.RemotePort = e.remotePort + if e.net.WasBound() { var err tcpip.Error id = stack.TransportEndpointID{ - LocalPort: e.ID.LocalPort, - LocalAddress: e.ID.LocalAddress, + LocalPort: info.ID.LocalPort, + LocalAddress: info.ID.LocalAddress, } id, btd, err = e.registerWithStack(e.effectiveNetProtos, id) if err != nil { return err } - e.setEndpointState(StateBound) boundPortFlags = e.boundPortFlags } else { - if e.ID.LocalPort != 0 { + if info.ID.LocalPort != 0 { // Release the ephemeral port. portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, - Addr: e.ID.LocalAddress, - Port: e.ID.LocalPort, + Addr: info.ID.LocalAddress, + Port: info.ID.LocalPort, Flags: boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: tcpip.FullAddress{}, @@ -922,15 +573,14 @@ func (e *endpoint) Disconnect() tcpip.Error { e.stack.ReleasePort(portRes) e.boundPortFlags = ports.Flags{} } - e.setEndpointState(StateInitial) } - e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice) - e.ID = id + e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, info.ID, e, boundPortFlags, e.boundBindToDevice) e.boundBindToDevice = btd - e.route.Release() - e.route = nil - e.dstPort = 0 + e.localPort = id.LocalPort + e.remotePort = id.RemotePort + + e.net.Disconnect() return nil } @@ -940,88 +590,48 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - nicID := addr.NIC - var localPort uint16 - switch e.EndpointState() { - case StateInitial: - case StateBound, StateConnected: - localPort = e.ID.LocalPort - if e.BindNICID == 0 { - break - } - - if nicID != 0 && nicID != e.BindNICID { - return &tcpip.ErrInvalidEndpointState{} + err := e.net.ConnectAndThen(addr, func(netProto tcpip.NetworkProtocolNumber, previousID, nextID stack.TransportEndpointID) tcpip.Error { + nextID.LocalPort = e.localPort + nextID.RemotePort = addr.Port + + // Even if we're connected, this endpoint can still be used to send + // packets on a different network protocol, so we register both even if + // v6only is set to false and this is an ipv6 endpoint. + netProtos := []tcpip.NetworkProtocolNumber{netProto} + if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() { + netProtos = []tcpip.NetworkProtocolNumber{ + header.IPv4ProtocolNumber, + header.IPv6ProtocolNumber, + } } - nicID = e.BindNICID - default: - return &tcpip.ErrInvalidEndpointState{} - } - - addr, netProto, err := e.checkV4MappedLocked(addr) - if err != nil { - return err - } - - r, nicID, err := e.connectRoute(nicID, addr, netProto) - if err != nil { - return err - } - - id := stack.TransportEndpointID{ - LocalAddress: e.ID.LocalAddress, - LocalPort: localPort, - RemotePort: addr.Port, - RemoteAddress: r.RemoteAddress(), - } - - if e.EndpointState() == StateInitial { - id.LocalAddress = r.LocalAddress() - } + oldPortFlags := e.boundPortFlags - // Even if we're connected, this endpoint can still be used to send - // packets on a different network protocol, so we register both even if - // v6only is set to false and this is an ipv6 endpoint. - netProtos := []tcpip.NetworkProtocolNumber{netProto} - if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() { - netProtos = []tcpip.NetworkProtocolNumber{ - header.IPv4ProtocolNumber, - header.IPv6ProtocolNumber, + nextID, btd, err := e.registerWithStack(netProtos, nextID) + if err != nil { + return err } - } - oldPortFlags := e.boundPortFlags + // Remove the old registration. + if e.localPort != 0 { + previousID.LocalPort = e.localPort + previousID.RemotePort = e.remotePort + e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, previousID, e, oldPortFlags, e.boundBindToDevice) + } - id, btd, err := e.registerWithStack(netProtos, id) + e.localPort = nextID.LocalPort + e.remotePort = nextID.RemotePort + e.boundBindToDevice = btd + e.effectiveNetProtos = netProtos + return nil + }) if err != nil { - r.Release() return err } - // Remove the old registration. - if e.ID.LocalPort != 0 { - e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, oldPortFlags, e.boundBindToDevice) - } - - e.ID = id - e.boundBindToDevice = btd - if e.route != nil { - // If the endpoint was already connected then make sure we release the - // previous route. - e.route.Release() - } - e.route = r - e.dstPort = addr.Port - e.RegisterNICID = nicID - e.effectiveNetProtos = netProtos - - e.setEndpointState(StateConnected) - e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() - return nil } @@ -1036,15 +646,23 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - // A socket in the bound state can still receive multicast messages, - // so we need to notify waiters on shutdown. - if state := e.EndpointState(); state != StateBound && state != StateConnected { + switch state := e.net.State(); state { + case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: return &tcpip.ErrNotConnected{} + case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: + default: + panic(fmt.Sprintf("unhandled state = %s", state)) } - e.shutdownFlags |= flags + if flags&tcpip.ShutdownWrite != 0 { + if err := e.net.Shutdown(); err != nil { + return err + } + } if flags&tcpip.ShutdownRead != 0 { + e.readShutdown = true + e.rcvMu.Lock() wasClosed := e.rcvClosed e.rcvClosed = true @@ -1070,7 +688,7 @@ func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpi func (e *endpoint) registerWithStack(netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, tcpip.Error) { bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) - if e.ID.LocalPort == 0 { + if e.localPort == 0 { portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, @@ -1108,56 +726,43 @@ func (e *endpoint) registerWithStack(netProtos []tcpip.NetworkProtocolNumber, id func (e *endpoint) bindLocked(addr tcpip.FullAddress) tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. - if e.EndpointState() != StateInitial { + if e.net.State() != transport.DatagramEndpointStateInitial { return &tcpip.ErrInvalidEndpointState{} } - addr, netProto, err := e.checkV4MappedLocked(addr) - if err != nil { - return err - } - - // Expand netProtos to include v4 and v6 if the caller is binding to a - // wildcard (empty) address, and this is an IPv6 endpoint with v6only - // set to false. - netProtos := []tcpip.NetworkProtocolNumber{netProto} - if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && addr.Addr == "" { - netProtos = []tcpip.NetworkProtocolNumber{ - header.IPv6ProtocolNumber, - header.IPv4ProtocolNumber, + err := e.net.BindAndThen(addr, func(boundNetProto tcpip.NetworkProtocolNumber, boundAddr tcpip.Address) tcpip.Error { + // Expand netProtos to include v4 and v6 if the caller is binding to a + // wildcard (empty) address, and this is an IPv6 endpoint with v6only + // set to false. + netProtos := []tcpip.NetworkProtocolNumber{boundNetProto} + if boundNetProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && boundAddr == "" { + netProtos = []tcpip.NetworkProtocolNumber{ + header.IPv6ProtocolNumber, + header.IPv4ProtocolNumber, + } } - } - nicID := addr.NIC - if len(addr.Addr) != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) { - // A local unicast address was specified, verify that it's valid. - nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) - if nicID == 0 { - return &tcpip.ErrBadLocalAddress{} + id := stack.TransportEndpointID{ + LocalPort: addr.Port, + LocalAddress: boundAddr, + } + id, btd, err := e.registerWithStack(netProtos, id) + if err != nil { + return err } - } - id := stack.TransportEndpointID{ - LocalPort: addr.Port, - LocalAddress: addr.Addr, - } - id, btd, err := e.registerWithStack(netProtos, id) + e.localPort = id.LocalPort + e.boundBindToDevice = btd + e.effectiveNetProtos = netProtos + return nil + }) if err != nil { return err } - e.ID = id - e.boundBindToDevice = btd - e.RegisterNICID = nicID - e.effectiveNetProtos = netProtos - - // Mark endpoint as bound. - e.setEndpointState(StateBound) - e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() - return nil } @@ -1172,9 +777,6 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { return err } - // Save the effective NICID generated by bindLocked. - e.BindNICID = e.RegisterNICID - return nil } @@ -1183,16 +785,9 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() - addr := e.ID.LocalAddress - if e.EndpointState() == StateConnected { - addr = e.route.LocalAddress() - } - - return tcpip.FullAddress{ - NIC: e.RegisterNICID, - Addr: addr, - Port: e.ID.LocalPort, - }, nil + addr := e.net.GetLocalAddress() + addr.Port = e.localPort + return addr, nil } // GetRemoteAddress returns the address to which the endpoint is connected. @@ -1200,15 +795,13 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() - if e.EndpointState() != StateConnected || e.dstPort == 0 { + addr, connected := e.net.GetRemoteAddress() + if !connected || e.remotePort == 0 { return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } - return tcpip.FullAddress{ - NIC: e.RegisterNICID, - Addr: e.ID.RemoteAddress, - Port: e.ID.RemotePort, - }, nil + addr.Port = e.remotePort + return addr, nil } // Readiness returns the current readiness of the endpoint. For example, if @@ -1303,6 +896,7 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB // Push new packet into receive list and increment the buffer size. packet := &udpPacket{ + netProto: pkt.NetworkProtocolNumber, senderAddress: tcpip.FullAddress{ NIC: pkt.NICID, Addr: id.RemoteAddress, @@ -1358,19 +952,20 @@ func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, p payload = udp.Payload() } + id := e.net.Info().ID e.SocketOptions().QueueErr(&tcpip.SockError{ Err: err, Cause: transErr, Payload: payload, Dst: tcpip.FullAddress{ NIC: pkt.NICID, - Addr: e.ID.RemoteAddress, - Port: e.ID.RemotePort, + Addr: id.RemoteAddress, + Port: e.remotePort, }, Offender: tcpip.FullAddress{ NIC: pkt.NICID, - Addr: e.ID.LocalAddress, - Port: e.ID.LocalPort, + Addr: id.LocalAddress, + Port: e.localPort, }, NetProto: pkt.NetworkProtocolNumber, }) @@ -1385,7 +980,7 @@ func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketB // TODO(gvisor.dev/issues/5270): Handle all transport errors. switch transErr.Kind() { case stack.DestinationPortUnreachableTransportError: - if e.EndpointState() == StateConnected { + if e.net.State() == transport.DatagramEndpointStateConnected { e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) } } @@ -1393,16 +988,17 @@ func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketB // State implements tcpip.Endpoint. func (e *endpoint) State() uint32 { - return uint32(e.EndpointState()) + return uint32(e.net.State()) } // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { e.mu.RLock() - // Make a copy of the endpoint info. - ret := e.TransportEndpointInfo - e.mu.RUnlock() - return &ret + defer e.mu.RUnlock() + info := e.net.Info() + info.ID.LocalPort = e.localPort + info.ID.RemotePort = e.remotePort + return &info } // Stats returns a pointer to the endpoint stats. @@ -1413,13 +1009,9 @@ func (e *endpoint) Stats() tcpip.EndpointStats { // Wait implements tcpip.Endpoint. func (*endpoint) Wait() {} -func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { - return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr) -} - // SetOwner implements tcpip.Endpoint. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { - e.owner = owner + e.net.SetOwner(owner) } // SocketOptions implements tcpip.Endpoint. diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 1f638c3f6..2ff8b0482 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -15,12 +15,13 @@ package udp import ( + "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" - "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport" ) // saveReceivedAt is invoked by stateify. @@ -35,17 +36,11 @@ func (p *udpPacket) loadReceivedAt(nsec int64) { // saveData saves udpPacket.data field. func (p *udpPacket) saveData() buffer.VectorisedView { - // We cannot save p.data directly as p.data.views may alias to p.views, - // which is not allowed by state framework (in-struct pointer). return p.data.Clone(nil) } // loadData loads udpPacket.data field. func (p *udpPacket) loadData(data buffer.VectorisedView) { - // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization - // here because data.views is not guaranteed to be loaded by now. Plus, - // data.views will be allocated anyway so there really is little point - // of utilizing p.views for data.views. p.data = data } @@ -66,50 +61,28 @@ func (e *endpoint) Resume(s *stack.Stack) { e.mu.Lock() defer e.mu.Unlock() + e.net.Resume(s) + e.stack = s e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) - for m := range e.multicastMemberships { - if err := e.stack.JoinGroup(e.NetProto, m.nicID, m.multicastAddr); err != nil { - panic(err) - } - } - - state := e.EndpointState() - if state != StateBound && state != StateConnected { - return - } - - netProto := e.effectiveNetProtos[0] - // Connect() and bindLocked() both assert - // - // netProto == header.IPv6ProtocolNumber - // - // before creating a multi-entry effectiveNetProtos. - if len(e.effectiveNetProtos) > 1 { - netProto = header.IPv6ProtocolNumber - } - - var err tcpip.Error - if state == StateConnected { - e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.ops.GetMulticastLoop()) + switch state := e.net.State(); state { + case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: + case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: + // Our saved state had a port, but we don't actually have a + // reservation. We need to remove the port from our state, but still + // pass it to the reservation machinery. + var err tcpip.Error + id := e.net.Info().ID + id.LocalPort = e.localPort + id.RemotePort = e.remotePort + id, e.boundBindToDevice, err = e.registerWithStack(e.effectiveNetProtos, id) if err != nil { panic(err) } - } else if len(e.ID.LocalAddress) != 0 && !e.isBroadcastOrMulticast(e.RegisterNICID, netProto, e.ID.LocalAddress) { // stateBound - // A local unicast address is specified, verify that it's valid. - if e.stack.CheckLocalAddress(e.RegisterNICID, netProto, e.ID.LocalAddress) == 0 { - panic(&tcpip.ErrBadLocalAddress{}) - } - } - - // Our saved state had a port, but we don't actually have a - // reservation. We need to remove the port from our state, but still - // pass it to the reservation machinery. - id := e.ID - e.ID.LocalPort = 0 - e.ID, e.boundBindToDevice, err = e.registerWithStack(e.effectiveNetProtos, id) - if err != nil { - panic(err) + e.localPort = id.LocalPort + e.remotePort = id.RemotePort + default: + panic(fmt.Sprintf("unhandled state = %s", state)) } } diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go index 7c357cb09..7238fc019 100644 --- a/pkg/tcpip/transport/udp/forwarder.go +++ b/pkg/tcpip/transport/udp/forwarder.go @@ -70,28 +70,29 @@ func (r *ForwarderRequest) ID() stack.TransportEndpointID { // CreateEndpoint creates a connected UDP endpoint for the session request. func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { + ep := newEndpoint(r.stack, r.pkt.NetworkProtocolNumber, queue) + ep.mu.Lock() + defer ep.mu.Unlock() + netHdr := r.pkt.Network() - route, err := r.stack.FindRoute(r.pkt.NICID, netHdr.DestinationAddress(), netHdr.SourceAddress(), r.pkt.NetworkProtocolNumber, false /* multicastLoop */) - if err != nil { + if err := ep.net.Bind(tcpip.FullAddress{NIC: r.pkt.NICID, Addr: netHdr.DestinationAddress(), Port: r.id.LocalPort}); err != nil { + return nil, err + } + + if err := ep.net.Connect(tcpip.FullAddress{NIC: r.pkt.NICID, Addr: netHdr.SourceAddress(), Port: r.id.RemotePort}); err != nil { return nil, err } - ep := newEndpoint(r.stack, r.pkt.NetworkProtocolNumber, queue) if err := r.stack.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{r.pkt.NetworkProtocolNumber}, ProtocolNumber, r.id, ep, ep.portFlags, tcpip.NICID(ep.ops.GetBindToDevice())); err != nil { ep.Close() - route.Release() return nil, err } - ep.ID = r.id - ep.route = route - ep.dstPort = r.id.RemotePort + ep.localPort = r.id.LocalPort + ep.remotePort = r.id.RemotePort ep.effectiveNetProtos = []tcpip.NetworkProtocolNumber{r.pkt.NetworkProtocolNumber} - ep.RegisterNICID = r.pkt.NICID ep.boundPortFlags = ep.portFlags - ep.state = uint32(StateConnected) - ep.rcvMu.Lock() ep.rcvReady = true ep.rcvMu.Unlock() diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 4008cacf2..2d15830a7 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -290,6 +290,7 @@ type testContext struct { t *testing.T linkEP *channel.Endpoint s *stack.Stack + nicID tcpip.NICID ep tcpip.Endpoint wq waiter.Queue @@ -301,6 +302,8 @@ func newDualTestContext(t *testing.T, mtu uint32) *testContext { } func newDualTestContextWithHandleLocal(t *testing.T, mtu uint32, handleLocal bool) *testContext { + const nicID = 1 + t.Helper() options := stack.Options{ @@ -316,32 +319,41 @@ func newDualTestContextWithHandleLocal(t *testing.T, mtu uint32, handleLocal boo if testing.Verbose() { wep = sniffer.New(ep) } - if err := s.CreateNIC(1, wep); err != nil { - t.Fatalf("CreateNIC failed: %s", err) + if err := s.CreateNIC(nicID, wep); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) } - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr); err != nil { - t.Fatalf("AddAddress failed: %s", err) + protocolAddrV4 := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: tcpip.Address(stackAddr).WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddrV4, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddrV4, err) } - if err := s.AddAddress(1, ipv6.ProtocolNumber, stackV6Addr); err != nil { - t.Fatalf("AddAddress failed: %s", err) + protocolAddrV6 := tcpip.ProtocolAddress{ + Protocol: ipv6.ProtocolNumber, + AddressWithPrefix: tcpip.Address(stackV6Addr).WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddrV6, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID, protocolAddrV6, err) } s.SetRouteTable([]tcpip.Route{ { Destination: header.IPv4EmptySubnet, - NIC: 1, + NIC: nicID, }, { Destination: header.IPv6EmptySubnet, - NIC: 1, + NIC: nicID, }, }) return &testContext{ t: t, s: s, + nicID: nicID, linkEP: ep, } } @@ -1644,8 +1656,10 @@ func TestSetTTL(t *testing.T) { } } +var v4PacketFlows = [...]testFlow{unicastV4, multicastV4, broadcast, unicastV4in6, multicastV4in6, broadcastIn6} + func TestSetTOS(t *testing.T) { - for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} { + for _, flow := range v4PacketFlows { t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { c := newDualTestContext(t, defaultMTU) defer c.cleanup() @@ -1680,8 +1694,10 @@ func TestSetTOS(t *testing.T) { } } +var v6PacketFlows = [...]testFlow{unicastV6, unicastV6Only, multicastV6} + func TestSetTClass(t *testing.T) { - for _, flow := range []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6} { + for _, flow := range v6PacketFlows { t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { c := newDualTestContext(t, defaultMTU) defer c.cleanup() @@ -1725,8 +1741,14 @@ func TestReceiveTosTClass(t *testing.T) { name string tests []testFlow }{ - {RcvTOSOpt, []testFlow{unicastV4, broadcast}}, - {RcvTClassOpt, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}}, + { + name: RcvTOSOpt, + tests: v4PacketFlows[:], + }, + { + name: RcvTClassOpt, + tests: v6PacketFlows[:], + }, } for _, testCase := range testCases { for _, flow := range testCase.tests { @@ -1737,6 +1759,14 @@ func TestReceiveTosTClass(t *testing.T) { c.createEndpointForFlow(flow) name := testCase.name + if flow.isMulticast() { + netProto := flow.netProto() + addr := flow.getMcastAddr() + if err := c.s.JoinGroup(netProto, c.nicID, addr); err != nil { + c.t.Fatalf("JoinGroup(%d, %d, %s): %s", netProto, c.nicID, addr, err) + } + } + var optionGetter func() bool var optionSetter func(bool) switch name { @@ -2482,8 +2512,8 @@ func TestOutgoingSubnetBroadcast(t *testing.T) { if err := s.CreateNIC(nicID1, e); err != nil { t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) } - if err := s.AddProtocolAddress(nicID1, test.nicAddr); err != nil { - t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID1, test.nicAddr, err) + if err := s.AddProtocolAddress(nicID1, test.nicAddr, stack.AddressProperties{}); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v, {}): %s", nicID1, test.nicAddr, err) } s.SetRouteTable(test.routes) diff --git a/pkg/test/testutil/testutil_runfiles.go b/pkg/test/testutil/testutil_runfiles.go index ece9ea9a1..1dbd48a47 100644 --- a/pkg/test/testutil/testutil_runfiles.go +++ b/pkg/test/testutil/testutil_runfiles.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package testutil import ( diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 0e9a829f6..0ef635a2f 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -27,6 +27,7 @@ import ( "os" "reflect" "runtime" + "time" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" @@ -458,29 +459,45 @@ func (s *Server) StartHandling(client *unet.Socket) { // No new requests should be initiated after calling Stop. Existing clients // will be closed after completing any pending RPCs. This method will block // until all clients have disconnected. -func (s *Server) Stop() { - // Wait for all outstanding requests. - defer s.wg.Wait() - +// +// timeout is the time for clients to complete ongoing RPCs. +func (s *Server) Stop(timeout time.Duration) { // Call any Stop callbacks. for _, stopper := range s.stoppers { stopper.Stop() } - // Close all known clients. - s.mu.Lock() - defer s.mu.Unlock() - for client, state := range s.clients { - switch state { - case idle: - // Close connection now. - client.Close() - s.clients[client] = closed - case processing: - // Request close when done. - s.clients[client] = closeRequested + done := make(chan bool, 1) + go func() { + if timeout != 0 { + timer := time.NewTicker(timeout) + defer timer.Stop() + select { + case <-done: + return + case <-timer.C: + } } - } + + // Close all known clients. + s.mu.Lock() + defer s.mu.Unlock() + for client, state := range s.clients { + switch state { + case idle: + // Close connection now. + client.Close() + s.clients[client] = closed + case processing: + // Request close when done. + s.clients[client] = closeRequested + } + } + }() + + // Wait for all outstanding requests. + s.wg.Wait() + done <- true } // Client is a urpc client. diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD index 3dba36f12..9c37a9626 100644 --- a/pkg/usermem/BUILD +++ b/pkg/usermem/BUILD @@ -7,16 +7,17 @@ go_library( srcs = [ "bytes_io.go", "bytes_io_unsafe.go", + "marshal.go", "usermem.go", ], visibility = ["//:sandbox"], deps = [ "//pkg/atomicbitops", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/gohacks", "//pkg/hostarch", "//pkg/safemem", - "//pkg/syserror", ], ) @@ -29,8 +30,8 @@ go_test( library = ":usermem", deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/safemem", - "//pkg/syserror", ], ) diff --git a/pkg/usermem/bytes_io.go b/pkg/usermem/bytes_io.go index 3da3c0294..777ac59a6 100644 --- a/pkg/usermem/bytes_io.go +++ b/pkg/usermem/bytes_io.go @@ -16,9 +16,9 @@ package usermem import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/syserror" ) const maxInt = int(^uint(0) >> 1) @@ -51,7 +51,7 @@ func (b *BytesIO) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, op // ZeroOut implements IO.ZeroOut. func (b *BytesIO) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts IOOpts) (int64, error) { if toZero > int64(maxInt) { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } rngN, rngErr := b.rangeCheck(addr, int(toZero)) if rngN == 0 { @@ -89,15 +89,15 @@ func (b *BytesIO) rangeCheck(addr hostarch.Addr, length int) (int, error) { return 0, nil } if length < 0 { - return 0, syserror.EINVAL + return 0, linuxerr.EINVAL } max := hostarch.Addr(len(b.Bytes)) if addr >= max { - return 0, syserror.EFAULT + return 0, linuxerr.EFAULT } end, ok := addr.AddLength(uint64(length)) if !ok || end > max { - return int(max - addr), syserror.EFAULT + return int(max - addr), linuxerr.EFAULT } return length, nil } diff --git a/pkg/usermem/marshal.go b/pkg/usermem/marshal.go new file mode 100644 index 000000000..5b5a662dc --- /dev/null +++ b/pkg/usermem/marshal.go @@ -0,0 +1,43 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" +) + +// IOCopyContext wraps an object implementing hostarch.IO to implement +// marshal.CopyContext. +type IOCopyContext struct { + Ctx context.Context + IO IO + Opts IOOpts +} + +// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. +func (i *IOCopyContext) CopyScratchBuffer(size int) []byte { + return make([]byte, size) +} + +// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. +func (i *IOCopyContext) CopyOutBytes(addr hostarch.Addr, b []byte) (int, error) { + return i.IO.CopyOut(i.Ctx, addr, b, i.Opts) +} + +// CopyInBytes implements marshal.CopyContext.CopyInBytes. +func (i *IOCopyContext) CopyInBytes(addr hostarch.Addr, b []byte) (int, error) { + return i.IO.CopyIn(i.Ctx, addr, b, i.Opts) +} diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index 0d6d25e50..f46a00e42 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -22,11 +22,10 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/safemem" ) // IO provides access to the contents of a virtual memory space. @@ -163,7 +162,7 @@ func (rw *IOReadWriter) Read(dst []byte) (int, error) { // Disallow wraparound. rw.Addr = ^hostarch.Addr(0) if err != nil { - err = syserror.EFAULT + err = linuxerr.EFAULT } } return n, err @@ -179,7 +178,7 @@ func (rw *IOReadWriter) Write(src []byte) (int, error) { // Disallow wraparound. rw.Addr = ^hostarch.Addr(0) if err != nil { - err = syserror.EFAULT + err = linuxerr.EFAULT } } return n, err @@ -214,7 +213,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr hostarch.Addr, maxlen int, o } end, ok := addr.AddLength(uint64(readlen)) if !ok { - return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT + return gohacks.StringFromImmutableBytes(buf[:done]), linuxerr.EFAULT } // Shorten the read to avoid crossing page boundaries, since faulting // in a page unnecessarily is expensive. This also ensures that partial @@ -244,7 +243,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr hostarch.Addr, maxlen int, o } addr = end } - return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG + return gohacks.StringFromImmutableBytes(buf), linuxerr.ENAMETOOLONG } // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The @@ -382,7 +381,7 @@ func CopyInt32StringsInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSe // Parse a single value. val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) if err != nil { - return int64(i), syserror.EINVAL + return int64(i), linuxerr.EINVAL } dsts[j] = int32(val) @@ -398,7 +397,7 @@ func CopyInt32StringsInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSe return int64(i), cperr } if j == 0 { - return int64(i), syserror.EINVAL + return int64(i), linuxerr.EINVAL } return int64(i), nil } @@ -430,7 +429,7 @@ type IOSequence struct { // return 0, nil // } // if f.availableBytes == 0 { -// return 0, syserror.ErrWouldBlock +// return 0, linuxerr.ErrWouldBlock // } // return ioseq.CopyOutFrom(..., reader) // diff --git a/pkg/usermem/usermem_test.go b/pkg/usermem/usermem_test.go index 9b697b593..a5e2fe69e 100644 --- a/pkg/usermem/usermem_test.go +++ b/pkg/usermem/usermem_test.go @@ -22,9 +22,9 @@ import ( "testing" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/syserror" ) // newContext returns a context.Context that we can use in these tests (we @@ -51,7 +51,7 @@ func TestBytesIOCopyOutSuccess(t *testing.T) { func TestBytesIOCopyOutFailure(t *testing.T) { b := newBytesIOString("ABC") n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := 2, linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) { @@ -75,7 +75,7 @@ func TestBytesIOCopyInFailure(t *testing.T) { b := newBytesIOString("Afo") var dst [3]byte n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := 2, linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) { @@ -97,7 +97,7 @@ func TestBytesIOZeroOutSuccess(t *testing.T) { func TestBytesIOZeroOutFailure(t *testing.T) { b := newBytesIOString("ABC") n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{}) - if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := int64(2), linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) { @@ -125,7 +125,7 @@ func TestBytesIOCopyOutFromFailure(t *testing.T) { {Start: 1, End: 4}, {Start: 4, End: 7}, }), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := int64(4), linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) { @@ -155,7 +155,7 @@ func TestBytesIOCopyInToFailure(t *testing.T) { {Start: 1, End: 4}, {Start: 4, End: 7}, }), safemem.FromIOWriter{&dst}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + if wantN, wantErr := int64(4), linuxerr.EFAULT; n != wantN || err != wantErr { t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) } if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { @@ -206,14 +206,14 @@ func TestCopyStringInVeryLong(t *testing.T) { func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { want := strings.Repeat("A", copyStringIncrement-1) got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) - if wantErr := syserror.EFAULT; got != want || err != wantErr { + if wantErr := linuxerr.EFAULT; got != want || err != wantErr { t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) } } func TestCopyStringInTruncatedByMaxlen(t *testing.T) { got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{}) - if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr { + if want, wantErr := strings.Repeat("A", 5), linuxerr.ENAMETOOLONG; got != want || err != wantErr { t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) } } @@ -272,8 +272,8 @@ func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) { src := BytesIOSequence([]byte(s)) initial := []int32{1, 2} dsts := append([]int32(nil), initial...) - if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL { - t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL) + if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); !linuxerr.Equals(linuxerr.EINVAL, err) { + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, linuxerr.EINVAL) } if !reflect.DeepEqual(dsts, initial) { t.Errorf("dsts: got %v, wanted %v", dsts, initial) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index a79afbdc4..ff7a5a44b 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -15,6 +15,7 @@ go_library( "limits.go", "loader.go", "network.go", + "profile.go", "strace.go", "vfs.go", ], @@ -32,6 +33,7 @@ go_library( "//pkg/control/server", "//pkg/coverage", "//pkg/cpuid", + "//pkg/errors/linuxerr", "//pkg/eventchannel", "//pkg/fd", "//pkg/flipcall", @@ -44,6 +46,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", + "//pkg/sentry/control:control_go_proto", "//pkg/sentry/devices/memdev", "//pkg/sentry/devices/ttydev", "//pkg/sentry/devices/tundev", @@ -94,11 +97,10 @@ go_library( "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip", + "//pkg/tcpip/link/ethernet", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", - "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 9b270cbf2..76e1f596b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -18,6 +18,7 @@ import ( "errors" "fmt" "os" + gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" @@ -25,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" + controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" @@ -40,80 +42,89 @@ import ( ) const ( - // ContainerCheckpoint checkpoints a container. - ContainerCheckpoint = "containerManager.Checkpoint" + // ContMgrCheckpoint checkpoints a container. + ContMgrCheckpoint = "containerManager.Checkpoint" - // ContainerCreate creates a container. - ContainerCreate = "containerManager.Create" + // ContMgrCreateSubcontainer creates a sub-container. + ContMgrCreateSubcontainer = "containerManager.CreateSubcontainer" - // ContainerDestroy is used to stop a non-root container and free all + // ContMgrDestroySubcontainer is used to stop a sub-container and free all // associated resources in the sandbox. - ContainerDestroy = "containerManager.Destroy" + ContMgrDestroySubcontainer = "containerManager.DestroySubcontainer" - // ContainerEvent is the URPC endpoint for getting stats about the - // container used by "runsc events". - ContainerEvent = "containerManager.Event" + // ContMgrEvent gets stats about the container used by "runsc events". + ContMgrEvent = "containerManager.Event" - // ContainerExecuteAsync is the URPC endpoint for executing a command in a - // container. - ContainerExecuteAsync = "containerManager.ExecuteAsync" + // ContMgrExecuteAsync executes a command in a container. + ContMgrExecuteAsync = "containerManager.ExecuteAsync" - // ContainerPause pauses the container. - ContainerPause = "containerManager.Pause" + // ContMgrProcesses lists processes running in a container. + ContMgrProcesses = "containerManager.Processes" - // ContainerProcesses is the URPC endpoint for getting the list of - // processes running in a container. - ContainerProcesses = "containerManager.Processes" + // ContMgrRestore restores a container from a statefile. + ContMgrRestore = "containerManager.Restore" - // ContainerRestore restores a container from a statefile. - ContainerRestore = "containerManager.Restore" + // ContMgrSignal sends a signal to a container. + ContMgrSignal = "containerManager.Signal" - // ContainerResume unpauses the paused container. - ContainerResume = "containerManager.Resume" + // ContMgrStartSubcontainer starts a sub-container inside a running sandbox. + ContMgrStartSubcontainer = "containerManager.StartSubcontainer" - // ContainerSignal is used to send a signal to a container. - ContainerSignal = "containerManager.Signal" + // ContMgrWait waits on the init process of the container and returns its + // ExitStatus. + ContMgrWait = "containerManager.Wait" - // ContainerSignalProcess is used to send a signal to a particular - // process in a container. - ContainerSignalProcess = "containerManager.SignalProcess" + // ContMgrWaitPID waits on a process with a certain PID in the sandbox and + // return its ExitStatus. + ContMgrWaitPID = "containerManager.WaitPID" - // ContainerStart is the URPC endpoint for running a non-root container - // within a sandbox. - ContainerStart = "containerManager.Start" - - // ContainerWait is used to wait on the init process of the container - // and return its ExitStatus. - ContainerWait = "containerManager.Wait" - - // ContainerWaitPID is used to wait on a process with a certain PID in - // the sandbox and return its ExitStatus. - ContainerWaitPID = "containerManager.WaitPID" + // ContMgrRootContainerStart starts a new sandbox with a root container. + ContMgrRootContainerStart = "containerManager.StartRoot" +) - // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links - // and routes in a network stack. +const ( + // NetworkCreateLinksAndRoutes creates links and routes in a network stack. NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" - // RootContainerStart is the URPC endpoint for starting a new sandbox - // with root container. - RootContainerStart = "containerManager.StartRoot" - - // SandboxStacks collects sandbox stacks for debugging. - SandboxStacks = "debug.Stacks" + // DebugStacks collects sandbox stacks for debugging. + DebugStacks = "debug.Stacks" ) // Profiling related commands (see pprof.go for more details). const ( - CPUProfile = "Profile.CPU" - HeapProfile = "Profile.Heap" - BlockProfile = "Profile.Block" - MutexProfile = "Profile.Mutex" - Trace = "Profile.Trace" + ProfileCPU = "Profile.CPU" + ProfileHeap = "Profile.Heap" + ProfileBlock = "Profile.Block" + ProfileMutex = "Profile.Mutex" + ProfileTrace = "Profile.Trace" ) // Logging related commands (see logging.go for more details). const ( - ChangeLogging = "Logging.Change" + LoggingChange = "Logging.Change" +) + +// Lifecycle related commands (see lifecycle.go for more details). +const ( + LifecyclePause = "Lifecycle.Pause" + LifecycleResume = "Lifecycle.Resume" +) + +// Filesystem related commands (see fs.go for more details). +const ( + FsCat = "Fs.Cat" +) + +// Usage related commands (see usage.go for more details). +const ( + UsageCollect = "Usage.Collect" + UsageUsageFD = "Usage.UsageFD" + UsageReduce = "Usage.Reduce" +) + +// Events related commands (see events.go for more details). +const ( + EventsAttachDebugEmitter = "Events.AttachDebugEmitter" ) // ControlSocketAddr generates an abstract unix socket name for the given ID. @@ -155,18 +166,41 @@ func newController(fd int, l *Loader) (*controller, error) { ctrl.srv.Register(net) } - ctrl.srv.Register(&debug{}) - ctrl.srv.Register(&control.Logging{}) - - if l.root.conf.ProfileEnable { - ctrl.srv.Register(control.NewProfile(l.k)) + if l.root.conf.Controls.Controls != nil { + for _, c := range l.root.conf.Controls.Controls.AllowedControls { + switch c { + case controlpb.ControlConfig_EVENTS: + ctrl.srv.Register(&control.Events{}) + case controlpb.ControlConfig_FS: + ctrl.srv.Register(&control.Fs{Kernel: l.k}) + case controlpb.ControlConfig_LIFECYCLE: + ctrl.srv.Register(&control.Lifecycle{Kernel: l.k}) + case controlpb.ControlConfig_LOGGING: + ctrl.srv.Register(&control.Logging{}) + case controlpb.ControlConfig_PROFILE: + if l.root.conf.ProfileEnable { + ctrl.srv.Register(control.NewProfile(l.k)) + } + case controlpb.ControlConfig_USAGE: + ctrl.srv.Register(&control.Usage{Kernel: l.k}) + case controlpb.ControlConfig_PROC: + ctrl.srv.Register(&control.Proc{Kernel: l.k}) + case controlpb.ControlConfig_STATE: + ctrl.srv.Register(&control.State{Kernel: l.k}) + case controlpb.ControlConfig_DEBUG: + ctrl.srv.Register(&debug{}) + } + } } return ctrl, nil } +// stopRPCTimeout is the time for clients to complete ongoing RPCs. +const stopRPCTimeout = 15 * gtime.Second + func (c *controller) stop() { - c.srv.Stop() + c.srv.Stop(stopRPCTimeout) } // containerManager manages sandbox containers. @@ -210,9 +244,9 @@ type CreateArgs struct { urpc.FilePayload } -// Create creates a container within a sandbox. -func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error { - log.Debugf("containerManager.Create: %s", args.CID) +// CreateSubcontainer creates a container within a sandbox. +func (cm *containerManager) CreateSubcontainer(args *CreateArgs, _ *struct{}) error { + log.Debugf("containerManager.CreateSubcontainer: %s", args.CID) if len(args.Files) > 1 { return fmt.Errorf("start arguments must have at most 1 files for TTY") @@ -225,7 +259,7 @@ func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error { return fmt.Errorf("error dup'ing TTY file: %w", err) } } - return cm.l.createContainer(args.CID, tty) + return cm.l.createSubcontainer(args.CID, tty) } // StartArgs contains arguments to the Start method. @@ -245,13 +279,13 @@ type StartArgs struct { urpc.FilePayload } -// Start runs a created container within a sandbox. -func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { +// StartSubcontainer runs a created container within a sandbox. +func (cm *containerManager) StartSubcontainer(args *StartArgs, _ *struct{}) error { // Validate arguments. if args == nil { return errors.New("start missing arguments") } - log.Debugf("containerManager.Start, cid: %s, args: %+v", args.CID, args) + log.Debugf("containerManager.StartSubcontainer, cid: %s, args: %+v", args.CID, args) if args.Spec == nil { return errors.New("start arguments missing spec") } @@ -299,19 +333,19 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { } }() - if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil { - log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err) + if err := cm.l.startSubcontainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil { + log.Debugf("containerManager.StartSubcontainer failed, cid: %s, args: %+v, err: %v", args.CID, args, err) return err } log.Debugf("Container started, cid: %s", args.CID) return nil } -// Destroy stops a container if it is still running and cleans up its -// filesystem. -func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { - log.Debugf("containerManager.destroy, cid: %s", *cid) - return cm.l.destroyContainer(*cid) +// DestroySubcontainer stops a container if it is still running and cleans up +// its filesystem. +func (cm *containerManager) DestroySubcontainer(cid *string, _ *struct{}) error { + log.Debugf("containerManager.DestroySubcontainer, cid: %s", *cid) + return cm.l.destroySubcontainer(*cid) } // ExecuteAsync starts running a command on a created or running sandbox. It @@ -330,6 +364,11 @@ func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) err // Checkpoint pauses a sandbox and saves its state. func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { log.Debugf("containerManager.Checkpoint") + // TODO(gvisor.dev/issues/6243): save/restore not supported w/ hostinet + if cm.l.root.conf.Network == config.NetworkHost { + return errors.New("checkpoint not supported when using hostinet") + } + state := control.State{ Kernel: cm.l.k, Watchdog: cm.l.watchdog, @@ -337,13 +376,6 @@ func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { return state.Save(o, nil) } -// Pause suspends a container. -func (cm *containerManager) Pause(_, _ *struct{}) error { - log.Debugf("containerManager.Pause") - cm.l.k.Pause() - return nil -} - // RestoreOpts contains options related to restoring a container's file system. type RestoreOpts struct { // FilePayload contains the state file to be restored, followed by the @@ -439,7 +471,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Load the state. loadOpts := state.LoadOpts{Source: specFile} - if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil { + if err := loadOpts.Load(ctx, k, nil, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil { return err } @@ -475,13 +507,6 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return nil } -// Resume unpauses a container. -func (cm *containerManager) Resume(_, _ *struct{}) error { - log.Debugf("containerManager.Resume") - cm.l.k.Unpause() - return nil -} - // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { log.Debugf("containerManager.Wait, cid: %s", *cid) diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 0814b2a69..65137de8a 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -91,7 +91,7 @@ func (cm *containerManager) Event(_ *struct{}, out *EventOut) error { // Memory usage. // TODO(gvisor.dev/issue/172): Per-container accounting. mem := cm.l.k.MemoryFile() - mem.UpdateUsage() + _ = mem.UpdateUsage() // best effort to update. _, totalUsage := usage.MemoryAccounting.Copy() out.Event.Data.Memory.Usage = MemoryEntry{ Usage: totalUsage, diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 752fea0e1..703f34827 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -459,6 +459,14 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.MatchAny{}, seccomp.EqualTo(unix.TIOCINQ), }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(unix.SIOCGIFFLAGS), + }, + { + seccomp.MatchAny{}, + seccomp.EqualTo(unix.SIOCGIFCONF), + }, }, unix.SYS_LISTEN: {}, unix.SYS_READV: {}, diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go index 42cb8ed3a..8015a0e52 100644 --- a/runsc/boot/filter/config_amd64.go +++ b/runsc/boot/filter/config_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package filter diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go index f162f87ff..9f44379b4 100644 --- a/runsc/boot/filter/config_arm64.go +++ b/runsc/boot/filter/config_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package filter diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go index 89b66a6da..214bf8b1d 100644 --- a/runsc/boot/filter/config_profile.go +++ b/runsc/boot/filter/config_profile.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package filter import ( diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go index e28d4b8d6..5442add95 100644 --- a/runsc/boot/filter/extra_filters.go +++ b/runsc/boot/filter/extra_filters.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !msan && !race // +build !msan,!race package filter diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index 41baa78cd..8873f9cf9 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build msan // +build msan package filter diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index 79b2104f0..046b39014 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build race // +build race package filter diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index c4590aab1..40cf2a3df 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -25,6 +25,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -41,7 +42,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" @@ -69,7 +69,7 @@ const ( // tmpfs has some extra supported options that we must pass through. var tmpfsAllowedData = []string{"mode", "uid", "gid"} -func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { +func addOverlay(ctx context.Context, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. upperFlags := lowerFlags upperFlags.ReadOnly = false @@ -744,7 +744,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.C if useOverlay { log.Debugf("Adding overlay on top of shared mount %q", hint.name) - inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf) + inode, err = addOverlay(ctx, inode, hint.mount.Type, mf) if err != nil { return nil, err } @@ -785,7 +785,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con if conf.Overlay && !c.root.Readonly { log.Debugf("Adding overlay on top of root mount") // Overlay a tmpfs filesystem on top of the root. - rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + rootInode, err = addOverlay(ctx, rootInode, "root-overlay-upper", mf) if err != nil { return nil, err } @@ -901,7 +901,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi if useOverlay { log.Debugf("Adding overlay on top of mount %q", m.Destination) - inode, err = addOverlay(ctx, conf, inode, m.Type, mf) + inode, err = addOverlay(ctx, inode, m.Type, mf) if err != nil { return err } @@ -1039,8 +1039,8 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mn maxTraversals := uint(0) tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals) - switch err { - case nil: + switch { + case err == nil: // Found '/tmp' in filesystem, check if it's empty. defer tmp.DecRef(ctx) f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) @@ -1061,7 +1061,7 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mn log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp") fallthrough - case syserror.ENOENT: + case linuxerr.Equals(linuxerr.ENOENT, err): // No '/tmp' found (or fallthrough from above). Safe to mount internal // tmpfs. tmpMount := specs.Mount{ diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ad4d50008..b46d84e5a 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -58,6 +58,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" @@ -119,6 +120,10 @@ type Loader struct { // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() + // stopProfiling stops profiling started at container creation. It + // should be called when a sandbox is destroyed. + stopProfiling func() + // restore is set to true if we are restoring a container. restore bool @@ -198,6 +203,21 @@ type Args struct { TotalMem uint64 // UserLogFD is the file descriptor to write user logs to. UserLogFD int + // ProfileBlockFD is the file descriptor to write a block profile to. + // Valid if >=0. + ProfileBlockFD int + // ProfileCPUFD is the file descriptor to write a CPU profile to. + // Valid if >=0. + ProfileCPUFD int + // ProfileHeapFD is the file descriptor to write a heap profile to. + // Valid if >=0. + ProfileHeapFD int + // ProfileMutexFD is the file descriptor to write a mutex profile to. + // Valid if >=0. + ProfileMutexFD int + // TraceFD is the file descriptor to write a Go execution trace to. + // Valid if >=0. + TraceFD int } // make sure stdioFDs are always the same on initial start and on restore @@ -206,6 +226,8 @@ const startingStdioFD = 256 // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(args Args) (*Loader, error) { + stopProfiling := startProfiling(args) + // We initialize the rand package now to make sure /dev/urandom is pre-opened // on kernels that do not support getrandom(2). if err := rand.Init(); err != nil { @@ -278,19 +300,15 @@ func New(args Args) (*Loader, error) { } // Create timekeeper. - tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) - if err != nil { - return nil, fmt.Errorf("creating timekeeper: %w", err) - } + tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) tk.SetClocks(time.NewCalibratedClocks()) - k.SetTimekeeper(tk) if err := enableStrace(args.Conf); err != nil { return nil, fmt.Errorf("enabling strace: %w", err) } // Create root network namespace/stack. - netns, err := newRootNetworkNamespace(args.Conf, k, k) + netns, err := newRootNetworkNamespace(args.Conf, tk, k) if err != nil { return nil, fmt.Errorf("creating network: %w", err) } @@ -332,6 +350,7 @@ func New(args Args) (*Loader, error) { // to createVFS in order to mount (among other things) procfs. if err = k.Init(kernel.InitKernelArgs{ FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, RootUserNamespace: creds.UserNamespace, RootNetworkNamespace: netns, ApplicationCores: uint(args.NumCPU), @@ -402,12 +421,13 @@ func New(args Args) (*Loader, error) { eid := execID{cid: args.ID} l := &Loader{ - k: k, - watchdog: dog, - sandboxID: args.ID, - processes: map[execID]*execProcess{eid: {}}, - mountHints: mountHints, - root: info, + k: k, + watchdog: dog, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + root: info, + stopProfiling: stopProfiling, } // We don't care about child signals; some platforms can generate a @@ -500,6 +520,8 @@ func (l *Loader) Destroy() { for _, f := range l.root.goferFDs { _ = f.Close() } + + l.stopProfiling() } func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) { @@ -636,8 +658,8 @@ func (l *Loader) run() error { return l.k.Start() } -// createContainer creates a new container inside the sandbox. -func (l *Loader) createContainer(cid string, tty *fd.FD) error { +// createSubcontainer creates a new container inside the sandbox. +func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error { l.mu.Lock() defer l.mu.Unlock() @@ -649,10 +671,10 @@ func (l *Loader) createContainer(cid string, tty *fd.FD) error { return nil } -// startContainer starts a child container. It returns the thread group ID of +// startSubcontainer starts a child container. It returns the thread group ID of // the newly created process. Used FDs are either closed or released. It's safe // for the caller to close any remaining files upon return. -func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error { +func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -718,7 +740,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin return fmt.Errorf("using TTY, stdios not expected: %d", l) } if ep.hostTTY == nil { - return fmt.Errorf("terminal enabled but no TTY provided (--console-socket possibly passed)") + return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?") } info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} ep.hostTTY = nil @@ -737,7 +759,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Create the FD map, which will set stdin, stdout, and stderr. ctx := info.procArgs.NewContext(l.k) - fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.spec.Process.User) if err != nil { return nil, nil, nil, fmt.Errorf("importing fds: %w", err) } @@ -745,8 +767,11 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn // ours either way. info.procArgs.FDTable = fdTable - // Setup the child container file system. - l.startGoferMonitor(cid, info.goferFDs) + // Gofer FDs must be ordered and the first FD is always the rootfs. + if len(info.goferFDs) < 1 { + return nil, nil, nil, fmt.Errorf("rootfs gofer FD not found") + } + l.startGoferMonitor(cid, int32(info.goferFDs[0].FD())) mntr := newContainerMounter(info, l.k, l.mountHints, kernel.VFS2Enabled) if root { @@ -819,17 +844,21 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on -// the gofer FDs looking for disconnects, and kills the container processes if a -// disconnect occurs in any of the gofer FDs. -func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) { +// the gofer FD looking for disconnects, and kills the container processes if +// the rootfs FD disconnects. +// +// Note that other gofer mounts are allowed to be unmounted and disconnected. +func (l *Loader) startGoferMonitor(cid string, rootfsGoferFD int32) { + if rootfsGoferFD < 0 { + panic(fmt.Sprintf("invalid FD: %d", rootfsGoferFD)) + } go func() { log.Debugf("Monitoring gofer health for container %q", cid) - var events []unix.PollFd - for _, goferFD := range goferFDs { - events = append(events, unix.PollFd{ - Fd: int32(goferFD.FD()), + events := []unix.PollFd{ + { + Fd: rootfsGoferFD, Events: unix.POLLHUP | unix.POLLRDHUP, - }) + }, } _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { // Use ppoll instead of poll because it's already whilelisted in seccomp. @@ -854,9 +883,9 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) { }() } -// destroyContainer stops a container if it is still running and cleans up its -// filesystem. -func (l *Loader) destroyContainer(cid string) error { +// destroySubcontainer stops a container if it is still running and cleans up +// its filesystem. +func (l *Loader) destroySubcontainer(cid string) error { l.mu.Lock() defer l.mu.Unlock() @@ -983,7 +1012,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { tty: ttyFile, ttyVFS2: ttyFileVFS2, } - log.Debugf("updated processes: %s", l.processes) + log.Debugf("updated processes: %v", l.processes) return tgid, nil } @@ -1004,7 +1033,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Check for leaks and write coverage report after the root container has // exited. This guarantees that the report is written in cases where the - // sandbox is killed by a signal after the ContainerWait request is completed. + // sandbox is killed by a signal after the ContMgrWait request is completed. if l.root.procArgs.ContainerID == cid { // All sentry-created resources should have been released at this point. refsvfs2.DoLeakCheck() @@ -1027,7 +1056,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e l.mu.Lock() delete(l.processes, eid) - log.Debugf("updated processes (removal): %s", l.processes) + log.Debugf("updated processes (removal): %v", l.processes) l.mu.Unlock() return nil } @@ -1054,7 +1083,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e // to exit. func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { tg.WaitExited() - return tg.ExitStatus().Status() + return uint32(tg.ExitStatus()) } // WaitForStartSignal waits for a start signal from the control server. @@ -1063,7 +1092,7 @@ func (l *Loader) WaitForStartSignal() { } // WaitExit waits for the root container to exit, and returns its exit status. -func (l *Loader) WaitExit() kernel.ExitStatus { +func (l *Loader) WaitExit() linux.WaitStatus { // Wait for container. l.k.WaitExited() @@ -1084,23 +1113,24 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st return inet.NewRootNamespace(hostinet.NewStack(), nil), nil case config.NetworkNone, config.NetworkSandbox: - s, err := newEmptySandboxNetworkStack(clock, uniqueID) + s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite) if err != nil { return nil, err } creator := &sandboxNetstackCreator{ - clock: clock, - uniqueID: uniqueID, + clock: clock, + uniqueID: uniqueID, + allowPacketEndpointWrite: conf.AllowPacketEndpointWrite, } return inet.NewRootNamespace(s, creator), nil default: - panic(fmt.Sprintf("invalid network configuration: %d", conf.Network)) + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) } } -func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) { netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} transProtos := []stack.TransportProtocolFactory{ tcp.NewProtocol, @@ -1116,9 +1146,10 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - RawFactory: raw.EndpointFactory{}, - UniqueID: uniqueID, - DefaultIPTables: netfilter.DefaultLinuxTables, + RawFactory: raw.EndpointFactory{}, + AllowPacketEndpointWrite: allowPacketEndpointWrite, + UniqueID: uniqueID, + DefaultIPTables: netfilter.DefaultLinuxTables, })} // Enable SACK Recovery. @@ -1155,13 +1186,14 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in // // +stateify savable type sandboxNetstackCreator struct { - clock tcpip.Clock - uniqueID stack.UniqueID + clock tcpip.Clock + uniqueID stack.UniqueID + allowPacketEndpointWrite bool } // CreateStack implements kernel.NetworkStackCreator.CreateStack. func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { - s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite) if err != nil { return nil, err } @@ -1170,7 +1202,7 @@ func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { n := &Network{Stack: s.(*netstack.Stack).Stack} nicID := tcpip.NICID(f.uniqueID.UniqueID()) link := DefaultLoopbackLink - linkEP := loopback.New() + linkEP := ethernet.New(loopback.New()) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return nil, err } @@ -1215,7 +1247,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e return nil default: - panic(fmt.Sprintf("unknown signal delivery mode %s", mode)) + panic(fmt.Sprintf("unknown signal delivery mode %v", mode)) } } @@ -1340,14 +1372,14 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2 return ep.tty, ep.ttyVFS2, nil } -func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, user specs.User) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if len(stdioFDs) != 3 { return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } k := kernel.KernelFromContext(ctx) fdTable := k.NewFDTable() - ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), stdioFDs) if err != nil { fdTable.DecRef(ctx) return nil, nil, nil, err diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 93c476971..ac6c26d25 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -188,8 +188,8 @@ func doRun(t *testing.T, vfsEnabled bool) { } // Wait for the application to exit. It should succeed. - if status := l.WaitExit(); status.Code != 0 || status.Signo != 0 { - t.Errorf("application exited with status %+v, want 0", status) + if status := l.WaitExit(); !status.Exited() || status.ExitStatus() != 0 { + t.Errorf("application exited with %s, want exit status 0", status) } } @@ -214,7 +214,7 @@ func doStartSignal(t *testing.T, vfsEnabled bool) { // We aren't going to wait on this application, so the control server // needs to be shut down manually. - defer l.ctrl.srv.Stop() + defer l.ctrl.srv.Stop(time.Hour) // Start a goroutine that calls WaitForStartSignal and writes to a // channel when it returns. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 7e627e4c6..9fb3ebd95 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -23,9 +23,9 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" - "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -169,7 +169,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct nicID++ nicids[link.Name] = nicID - linkEP := loopback.New() + linkEP := ethernet.New(loopback.New()) log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { @@ -209,7 +209,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), - EthernetHeader: true, + EthernetHeader: mac != "", Address: mac, PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, @@ -228,9 +228,6 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } - // Enable support for AF_PACKET sockets to receive outgoing packets. - linkEP = packetsocket.New(linkEP) - log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err @@ -285,12 +282,15 @@ func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkE for _, addr := range addrs { proto, tcpipAddr := ipToAddressAndProto(addr.Address) - ap := tcpip.AddressWithPrefix{ - Address: tcpipAddr, - PrefixLen: addr.PrefixLen, + protocolAddr := tcpip.ProtocolAddress{ + Protocol: proto, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: tcpipAddr, + PrefixLen: addr.PrefixLen, + }, } - if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil { - return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) + if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil { + return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err) } } return nil diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go index 1ded20dee..36b78ad86 100644 --- a/runsc/boot/pprof/pprof.go +++ b/runsc/boot/pprof/pprof.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + // Package pprof provides a stub to initialize custom profilers. package pprof diff --git a/runsc/boot/profile.go b/runsc/boot/profile.go new file mode 100644 index 000000000..3ecd3e532 --- /dev/null +++ b/runsc/boot/profile.go @@ -0,0 +1,95 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "os" + "runtime" + "runtime/pprof" + "runtime/trace" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" +) + +// startProfiling initiates profiling as defined by the ProfileConfig, and +// returns a function that should be called to stop profiling. +func startProfiling(args Args) func() { + var onStopProfiling []func() + stopProfiling := func() { + for _, f := range onStopProfiling { + f() + } + } + + if args.ProfileBlockFD >= 0 { + file := os.NewFile(uintptr(args.ProfileBlockFD), "profile-block") + + runtime.SetBlockProfileRate(control.DefaultBlockProfileRate) + onStopProfiling = append(onStopProfiling, func() { + if err := pprof.Lookup("block").WriteTo(file, 0); err != nil { + log.Warningf("Error writing block profile: %v", err) + } + file.Close() + runtime.SetBlockProfileRate(0) + }) + } + + if args.ProfileCPUFD >= 0 { + file := os.NewFile(uintptr(args.ProfileCPUFD), "profile-cpu") + + pprof.StartCPUProfile(file) + onStopProfiling = append(onStopProfiling, func() { + pprof.StopCPUProfile() + file.Close() + }) + } + + if args.ProfileHeapFD >= 0 { + file := os.NewFile(uintptr(args.ProfileHeapFD), "profile-heap") + + onStopProfiling = append(onStopProfiling, func() { + if err := pprof.Lookup("heap").WriteTo(file, 0); err != nil { + log.Warningf("Error writing heap profile: %v", err) + } + file.Close() + }) + } + + if args.ProfileMutexFD >= 0 { + file := os.NewFile(uintptr(args.ProfileMutexFD), "profile-mutex") + + prev := runtime.SetMutexProfileFraction(control.DefaultMutexProfileRate) + onStopProfiling = append(onStopProfiling, func() { + if err := pprof.Lookup("mutex").WriteTo(file, 0); err != nil { + log.Warningf("Error writing mutex profile: %v", err) + } + file.Close() + runtime.SetMutexProfileFraction(prev) + }) + } + + if args.TraceFD >= 0 { + file := os.NewFile(uintptr(args.TraceFD), "trace") + + trace.Start(file) + onStopProfiling = append(onStopProfiling, func() { + trace.Stop() + file.Close() + }) + } + + return stopProfiling +} diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index c21648a32..cf5be34cd 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -35,9 +35,14 @@ func enableStrace(conf *config.Config) error { } strace.LogMaximumSize = max + sink := strace.SinkTypeLog + if conf.StraceEvent { + sink = strace.SinkTypeEvent + } + if len(conf.StraceSyscalls) == 0 { - strace.EnableAll(strace.SinkTypeLog) + strace.EnableAll(sink) return nil } - return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog) + return strace.Enable(strings.Split(conf.StraceSyscalls, ","), sink) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 52aa33529..346796d9c 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/devices/memdev" @@ -44,7 +45,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -656,20 +656,20 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config Path: fspath.Parse("/tmp"), } fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) - switch err { - case nil: + switch { + case err == nil: defer fd.DecRef(ctx) err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { if dirent.Name != "." && dirent.Name != ".." { - return syserror.ENOTEMPTY + return linuxerr.ENOTEMPTY } return nil })) - switch err { - case nil: + switch { + case err == nil: log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) - case syserror.ENOTEMPTY: + case linuxerr.Equals(linuxerr.ENOTEMPTY, err): // If more than "." and ".." is found, skip internal tmpfs to prevent // hiding existing files. log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) @@ -679,7 +679,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config } fallthrough - case syserror.ENOENT: + case linuxerr.Equals(linuxerr.ENOENT, err): // No '/tmp' found (or fallthrough from above). It's safe to mount internal // tmpfs. tmpMount := specs.Mount{ @@ -692,7 +692,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{mount: &tmpMount}) return err - case syserror.ENOTDIR: + case linuxerr.Equals(linuxerr.ENOTDIR, err): // Not a dir?! Let it be. return nil diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 66a6a0f68..5dbf14376 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -424,10 +424,9 @@ func (c *Cgroup) Uninstall() error { // restores cgroup to the original state. func (c *Cgroup) Join() (func(), error) { // First save the current state so it can be restored. - undo := func() {} paths, err := loadPaths("self") if err != nil { - return undo, err + return nil, err } var undoPaths []string for ctrlr, path := range paths { @@ -438,8 +437,7 @@ func (c *Cgroup) Join() (func(), error) { } } - // Replace empty undo with the real thing before changes are made to cgroups. - undo = func() { + cu := cleanup.Make(func() { for _, path := range undoPaths { log.Debugf("Restoring cgroup %q", path) // Writing the value 0 to a cgroup.procs file causes @@ -449,7 +447,8 @@ func (c *Cgroup) Join() (func(), error) { log.Warningf("Error restoring cgroup %q: %v", path, err) } } - } + }) + defer cu.Clean() // Now join the cgroups. for key, ctrlr := range controllers { @@ -461,10 +460,10 @@ func (c *Cgroup) Join() (func(), error) { if ctrlr.optional() && os.IsNotExist(err) { continue } - return undo, err + return nil, err } } - return undo, nil + return cu.Release(), nil } // CPUQuota returns the CFS CPU quota. diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index eba40621e..1431b4e8f 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -800,7 +800,7 @@ func TestLoadPaths(t *testing.T) { if err != nil { t.Fatalf("Unexpected error: %v", err) } - } else if !strings.Contains(err.Error(), tc.err) { + } else if err == nil || !strings.Contains(err.Error(), tc.err) { t.Fatalf("Wrong error message, want: *%s*, got: %v", tc.err, err) } for key, vWant := range tc.want { diff --git a/runsc/cli/main.go b/runsc/cli/main.go index 76184cd9c..3556d7665 100644 --- a/runsc/cli/main.go +++ b/runsc/cli/main.go @@ -243,7 +243,7 @@ func Main(version string) { subcmdCode := subcommands.Execute(context.Background(), conf, &ws) // Check for leaks and write coverage report before os.Exit(). refsvfs2.DoLeakCheck() - coverage.Report() + _ = coverage.Report() if subcmdCode == subcommands.ExitSuccess { log.Infof("Exiting with status: %v", ws) if ws.Signaled() { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 39c8ff603..c5e32807d 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,6 +36,7 @@ go_library( "statefile.go", "symbolize.go", "syscalls.go", + "usage.go", "verity_prepare.go", "wait.go", ], @@ -95,10 +96,10 @@ go_test( "//runsc/config", "//runsc/container", "//runsc/mitigate", - "//runsc/mitigate/mock", "//runsc/specutils", "@com_github_google_go_cmp//cmp:go_default_library", "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", + "@com_github_google_subcommands//:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", ], diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index a14249641..e33a7f3cb 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -79,6 +79,26 @@ type Boot struct { // sandbox (e.g. gofer) and sent through this FD. mountsFD int + // profileBlockFD is the file descriptor to write a block profile to. + // Valid if >= 0. + profileBlockFD int + + // profileCPUFD is the file descriptor to write a CPU profile to. + // Valid if >= 0. + profileCPUFD int + + // profileHeapFD is the file descriptor to write a heap profile to. + // Valid if >= 0. + profileHeapFD int + + // profileMutexFD is the file descriptor to write a mutex profile to. + // Valid if >= 0. + profileMutexFD int + + // traceFD is the file descriptor to write a Go execution trace to. + // Valid if >= 0. + traceFD int + // pidns is set if the sandbox is in its own pid namespace. pidns bool @@ -119,6 +139,11 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") + f.IntVar(&b.profileBlockFD, "profile-block-fd", -1, "file descriptor to write block profile to. -1 disables profiling.") + f.IntVar(&b.profileCPUFD, "profile-cpu-fd", -1, "file descriptor to write CPU profile to. -1 disables profiling.") + f.IntVar(&b.profileHeapFD, "profile-heap-fd", -1, "file descriptor to write heap profile to. -1 disables profiling.") + f.IntVar(&b.profileMutexFD, "profile-mutex-fd", -1, "file descriptor to write mutex profile to. -1 disables profiling.") + f.IntVar(&b.traceFD, "trace-fd", -1, "file descriptor to write Go execution trace to. -1 disables tracing.") f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") } @@ -157,10 +182,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // we will read it again after the exec call. This works // because the ReadSpecFromFile function seeks to the beginning // of the file before reading. - if err := callSelfAsNobody(args); err != nil { - Fatalf("%v", err) - } - panic("callSelfAsNobody must never return success") + Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args)) + panic("unreachable") } } @@ -199,10 +222,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // we will read it again after the exec call. This works // because the ReadSpecFromFile function seeks to the beginning // of the file before reading. - if err := setCapsAndCallSelf(args, caps); err != nil { - Fatalf("%v", err) - } - panic("setCapsAndCallSelf must never return success") + Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps)) + panic("unreachable") } // Read resolved mount list and replace the original one from the spec. @@ -217,16 +238,21 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Create the loader. bootArgs := boot.Args{ - ID: f.Arg(0), - Spec: spec, - Conf: conf, - ControllerFD: b.controllerFD, - Device: os.NewFile(uintptr(b.deviceFD), "platform device"), - GoferFDs: b.ioFDs.GetArray(), - StdioFDs: b.stdioFDs.GetArray(), - NumCPU: b.cpuNum, - TotalMem: b.totalMem, - UserLogFD: b.userLogFD, + ID: f.Arg(0), + Spec: spec, + Conf: conf, + ControllerFD: b.controllerFD, + Device: os.NewFile(uintptr(b.deviceFD), "platform device"), + GoferFDs: b.ioFDs.GetArray(), + StdioFDs: b.stdioFDs.GetArray(), + NumCPU: b.cpuNum, + TotalMem: b.totalMem, + UserLogFD: b.userLogFD, + ProfileBlockFD: b.profileBlockFD, + ProfileCPUFD: b.profileCPUFD, + ProfileHeapFD: b.profileHeapFD, + ProfileMutexFD: b.profileMutexFD, + TraceFD: b.traceFD, } l, err := boot.New(bootArgs) if err != nil { @@ -259,7 +285,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) ws := l.WaitExit() log.Infof("application exiting with %+v", ws) waitStatus := args[1].(*unix.WaitStatus) - *waitStatus = unix.WaitStatus(ws.Status()) + *waitStatus = unix.WaitStatus(ws) l.Destroy() return subcommands.ExitSuccess } diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index e13a94486..99075d82d 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -122,6 +122,9 @@ func TestCapabilities(t *testing.T) { func TestMain(m *testing.M) { flag.Parse() - specutils.MaybeRunAsRoot() + if err := specutils.MaybeRunAsRoot(); err != nil { + fmt.Fprintf(os.Stderr, "Error running as root: %v", err) + os.Exit(123) + } os.Exit(m.Run()) } diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index e988247da..1fe9c6435 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -30,7 +30,7 @@ func mountInChroot(chroot, src, dst, typ string, flags uint32) error { chrootDst := filepath.Join(chroot, dst) log.Infof("Mounting %q at %q", src, chrootDst) - if err := specutils.Mount(src, chrootDst, typ, flags); err != nil { + if err := specutils.SafeSetupAndMount(src, chrootDst, typ, flags, "/proc"); err != nil { return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) } return nil @@ -59,6 +59,23 @@ func pivotRoot(root string) error { return nil } +func copyFile(dst, src string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + _, err = out.ReadFrom(in) + return err +} + // setUpChroot creates an empty directory with runsc mounted at /runsc and proc // mounted at /proc. func setUpChroot(pidns bool) error { @@ -70,14 +87,22 @@ func setUpChroot(pidns bool) error { // Convert all shared mounts into slave to be sure that nothing will be // propagated outside of our namespace. - if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", "/proc"); err != nil { return fmt.Errorf("error converting mounts: %v", err) } - if err := unix.Mount("runsc-root", chroot, "tmpfs", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC, ""); err != nil { + if err := specutils.SafeMount("runsc-root", chroot, "tmpfs", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC, "", "/proc"); err != nil { return fmt.Errorf("error mounting tmpfs in choot: %v", err) } + if err := os.Mkdir(filepath.Join(chroot, "etc"), 0755); err != nil { + return fmt.Errorf("error creating /etc in chroot: %v", err) + } + + if err := copyFile(filepath.Join(chroot, "etc/localtime"), "/etc/localtime"); err != nil { + log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err) + } + if pidns { flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY) if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil { @@ -89,7 +114,7 @@ func setUpChroot(pidns bool) error { } } - if err := unix.Mount("", chroot, "", unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_BIND, ""); err != nil { + if err := specutils.SafeMount("", chroot, "", unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_BIND, "", "/proc"); err != nil { return fmt.Errorf("error remounting chroot in read-only: %v", err) } diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 6212ffb2e..318753728 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -37,9 +37,9 @@ type Debug struct { pid int stacks bool signal int - profileHeap string - profileCPU string profileBlock string + profileCPU string + profileHeap string profileMutex string trace string strace string @@ -48,6 +48,7 @@ type Debug struct { delay time.Duration duration time.Duration ps bool + cat stringSlice } // Name implements subcommands.Command. @@ -69,9 +70,9 @@ func (*Debug) Usage() string { func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set") f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") - f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") - f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.") + f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") + f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.") f.DurationVar(&d.delay, "delay", time.Hour, "amount of time to delay for collecting heap and goroutine profiles.") f.DurationVar(&d.duration, "duration", time.Hour, "amount of time to wait for CPU and trace profiles.") @@ -81,6 +82,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).") f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.") f.BoolVar(&d.ps, "ps", false, "lists processes") + f.Var(&d.cat, "cat", "reads files and print to standard output") } // Execute implements subcommands.Command.Execute. @@ -88,6 +90,13 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) var c *container.Container conf := args[0].(*config.Config) + if conf.ProfileBlock != "" || conf.ProfileCPU != "" || conf.ProfileHeap != "" || conf.ProfileMutex != "" { + return Errorf("global -profile-{block,cpu,heap,mutex} flags have no effect on runsc debug. Pass runsc debug -profile-{block,cpu,heap,mutex} instead") + } + if conf.TraceFile != "" { + return Errorf("global -trace flag has no effect on runsc debug. Pass runsc debug -trace instead") + } + if d.pid == 0 { // No pid, container ID must have been provided. if f.NArg() != 1 { @@ -166,7 +175,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) log.Infof("Enabling strace for syscalls: %s", d.strace) args.SetStrace = true args.EnableStrace = true - args.StraceWhitelist = strings.Split(d.strace, ",") + args.StraceAllowlist = strings.Split(d.strace, ",") } if len(d.logLevel) != 0 { @@ -217,19 +226,19 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Open profiling files. var ( - heapFile *os.File - cpuFile *os.File - traceFile *os.File blockFile *os.File + cpuFile *os.File + heapFile *os.File mutexFile *os.File + traceFile *os.File ) - if d.profileHeap != "" { - f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if d.profileBlock != "" { + f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { - return Errorf("error opening heap profile output: %v", err) + return Errorf("error opening blocking profile output: %v", err) } defer f.Close() - heapFile = f + blockFile = f } if d.profileCPU != "" { f, err := os.OpenFile(d.profileCPU, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) @@ -239,20 +248,13 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) defer f.Close() cpuFile = f } - if d.trace != "" { - f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) - if err != nil { - return Errorf("error opening trace profile output: %v", err) - } - traceFile = f - } - if d.profileBlock != "" { - f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if d.profileHeap != "" { + f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { - return Errorf("error opening blocking profile output: %v", err) + return Errorf("error opening heap profile output: %v", err) } defer f.Close() - blockFile = f + heapFile = f } if d.profileMutex != "" { f, err := os.OpenFile(d.profileMutex, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) @@ -262,21 +264,28 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) defer f.Close() mutexFile = f } + if d.trace != "" { + f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return Errorf("error opening trace profile output: %v", err) + } + traceFile = f + } // Collect profiles. var ( wg sync.WaitGroup - heapErr error - cpuErr error - traceErr error blockErr error + cpuErr error + heapErr error mutexErr error + traceErr error ) - if heapFile != nil { + if blockFile != nil { wg.Add(1) go func() { defer wg.Done() - heapErr = c.Sandbox.HeapProfile(heapFile, d.delay) + blockErr = c.Sandbox.BlockProfile(blockFile, d.duration) }() } if cpuFile != nil { @@ -286,25 +295,25 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) cpuErr = c.Sandbox.CPUProfile(cpuFile, d.duration) }() } - if traceFile != nil { + if heapFile != nil { wg.Add(1) go func() { defer wg.Done() - traceErr = c.Sandbox.Trace(traceFile, d.duration) + heapErr = c.Sandbox.HeapProfile(heapFile, d.delay) }() } - if blockFile != nil { + if mutexFile != nil { wg.Add(1) go func() { defer wg.Done() - blockErr = c.Sandbox.BlockProfile(blockFile, d.duration) + mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration) }() } - if mutexFile != nil { + if traceFile != nil { wg.Add(1) go func() { defer wg.Done() - mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration) + traceErr = c.Sandbox.Trace(traceFile, d.duration) }() } @@ -337,35 +346,41 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Collect all errors. errorCount := 0 - if heapErr != nil { + if blockErr != nil { errorCount++ - log.Infof("error collecting heap profile: %v", heapErr) - os.Remove(heapFile.Name()) + log.Infof("error collecting block profile: %v", blockErr) + os.Remove(blockFile.Name()) } if cpuErr != nil { errorCount++ log.Infof("error collecting cpu profile: %v", cpuErr) os.Remove(cpuFile.Name()) } - if traceErr != nil { - errorCount++ - log.Infof("error collecting trace profile: %v", traceErr) - os.Remove(traceFile.Name()) - } - if blockErr != nil { + if heapErr != nil { errorCount++ - log.Infof("error collecting block profile: %v", blockErr) - os.Remove(blockFile.Name()) + log.Infof("error collecting heap profile: %v", heapErr) + os.Remove(heapFile.Name()) } if mutexErr != nil { errorCount++ log.Infof("error collecting mutex profile: %v", mutexErr) os.Remove(mutexFile.Name()) } + if traceErr != nil { + errorCount++ + log.Infof("error collecting trace profile: %v", traceErr) + os.Remove(traceFile.Name()) + } if errorCount > 0 { return subcommands.ExitFailure } + if d.cat != nil { + if err := c.Cat(d.cat, os.Stdout); err != nil { + return Errorf("Cat failed: %v", err) + } + } + return subcommands.ExitSuccess } diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 5485db149..4eb5a96f1 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -130,7 +130,6 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su if conf.Network == config.NetworkNone { addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) - } else if conf.Rootless { if conf.Network == config.NetworkSandbox { c.notifyUser("*** Warning: sandbox network isn't supported with --rootless, switching to host ***") @@ -225,25 +224,25 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { args := strings.Split(cmd, " ") cmd := exec.Command(args[0], args[1:]...) if err := cmd.Run(); err != nil { - c.cleanupNet(cid, dev, "", "", "") + c.cleanupNet(cid, "", "", "") return nil, fmt.Errorf("failed to run %q: %v", cmd, err) } } resolvPath, err := makeFile("/etc/resolv.conf", "nameserver 8.8.8.8\n", spec) if err != nil { - c.cleanupNet(cid, dev, "", "", "") + c.cleanupNet(cid, "", "", "") return nil, err } hostnamePath, err := makeFile("/etc/hostname", cid+"\n", spec) if err != nil { - c.cleanupNet(cid, dev, resolvPath, "", "") + c.cleanupNet(cid, resolvPath, "", "") return nil, err } hosts := fmt.Sprintf("127.0.0.1\tlocalhost\n%s\t%s\n", c.ip, cid) hostsPath, err := makeFile("/etc/hosts", hosts, spec) if err != nil { - c.cleanupNet(cid, dev, resolvPath, hostnamePath, "") + c.cleanupNet(cid, resolvPath, hostnamePath, "") return nil, err } @@ -253,7 +252,7 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { } addNamespace(spec, netns) - return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil + return func() { c.cleanupNet(cid, resolvPath, hostnamePath, hostsPath) }, nil } // cleanupNet tries to cleanup the network setup in setupNet. @@ -263,7 +262,7 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { // // Unfortunately none of this can be automatically cleaned up on process exit, // we must do so explicitly. -func (c *Do) cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath string) { +func (c *Do) cleanupNet(cid, resolvPath, hostnamePath, hostsPath string) { _, peer := deviceNames(cid) cmds := []string{ diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go index 3585b5448..96c5c1e8d 100644 --- a/runsc/cmd/error.go +++ b/runsc/cmd/error.go @@ -58,7 +58,7 @@ func Errorf(format string, args ...interface{}) subcommands.ExitStatus { panic(err) } if ErrorLogger != nil { - ErrorLogger.Write(b) + _, _ = ErrorLogger.Write(b) } return subcommands.ExitFailure diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index 06f00e8e7..08246e543 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -33,6 +33,10 @@ type Events struct { intervalSec int // If true, events will print a single group of stats and exit. stats bool + // If true, events will dump all filtered events to stdout. + stream bool + // filters for streamed events. + filters stringSlice } // Name implements subcommands.Command.Name. @@ -62,6 +66,8 @@ OPTIONS: func (evs *Events) SetFlags(f *flag.FlagSet) { f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds") f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit") + f.BoolVar(&evs.stream, "stream", false, "dump all filtered events to stdout") + f.Var(&evs.filters, "filters", "only display matching events") } // Execute implements subcommands.Command.Execute. @@ -79,6 +85,13 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa Fatalf("loading sandbox: %v", err) } + if evs.stream { + if err := c.Stream(evs.filters, os.Stdout); err != nil { + Fatalf("Stream failed: %v", err) + } + return subcommands.ExitSuccess + } + // Repeatedly get stats from the container. for { // Get the event and print it as JSON. @@ -97,7 +110,9 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa if err != nil { log.Warningf("Error while marshalling event %v: %v", ev.Event, err) } else { - os.Stdout.Write(b) + if _, err := os.Stdout.Write(b); err != nil { + Fatalf("Error writing to stdout: %v", err) + } } // If we're only running once, break. If we're only running diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 242d474b8..2139fdf53 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -146,12 +146,12 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if ex.detach { return ex.execChildAndWait(waitStatus) } - return ex.exec(c, e, waitStatus) + return ex.exec(conf, c, e, waitStatus) } -func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *unix.WaitStatus) subcommands.ExitStatus { +func (ex *Exec) exec(conf *config.Config, c *container.Container, e *control.ExecArgs, waitStatus *unix.WaitStatus) subcommands.ExitStatus { // Start the new process and get its pid. - pid, err := c.Execute(e) + pid, err := c.Execute(conf, e) if err != nil { return Errorf("executing processes for container: %v", err) } diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 5ded7b946..2193e9040 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -116,9 +116,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Note: minimal argument handling for the default case to keep it simple. args := os.Args args = append(args, "--apply-caps=false", "--setup-root=false") - if err := setCapsAndCallSelf(args, goferCaps); err != nil { - Fatalf("Unable to apply caps: %v", err) - } + Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps)) panic("unreachable") } @@ -267,7 +265,8 @@ func isReadonlyMount(opts []string) bool { func setupRootFS(spec *specs.Spec, conf *config.Config) error { // Convert all shared mounts into slaves to be sure that nothing will be // propagated outside of our namespace. - if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + procPath := "/proc" + if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil { Fatalf("error converting mounts: %v", err) } @@ -280,21 +279,34 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { // We need a directory to construct a new root and we know that // runsc can't start without /proc, so we can use it for this. flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC) - if err := unix.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil { + if err := specutils.SafeMount("runsc-root", "/proc", "tmpfs", flags, "", procPath); err != nil { Fatalf("error mounting tmpfs: %v", err) } // Prepare tree structure for pivot_root(2). - os.Mkdir("/proc/proc", 0755) - os.Mkdir("/proc/root", 0755) + if err := os.Mkdir("/proc/proc", 0755); err != nil { + Fatalf("error creating /proc/proc: %v", err) + } + if err := os.Mkdir("/proc/root", 0755); err != nil { + Fatalf("error creating /proc/root: %v", err) + } + if err := os.Mkdir("/proc/etc", 0755); err != nil { + Fatalf("error creating /proc/etc: %v", err) + } + // This cannot use SafeMount because there's no available procfs. But we + // know that /proc is an empty tmpfs mount, so this is safe. if err := unix.Mount("runsc-proc", "/proc/proc", "proc", flags|unix.MS_RDONLY, ""); err != nil { Fatalf("error mounting proc: %v", err) } + if err := copyFile("/proc/etc/localtime", "/etc/localtime"); err != nil { + log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err) + } root = "/proc/root" + procPath = "/proc/proc" } // Mount root path followed by submounts. - if err := unix.Mount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, ""); err != nil { + if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil { return fmt.Errorf("mounting root on root (%q) err: %v", root, err) } @@ -302,12 +314,12 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation}) } - if err := unix.Mount("", root, "", uintptr(flags), ""); err != nil { + if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil { return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err) } // Replace the current spec, with the clean spec with symlinks resolved. - if err := setupMounts(conf, spec.Mounts, root); err != nil { + if err := setupMounts(conf, spec.Mounts, root, procPath); err != nil { Fatalf("error setting up FS: %v", err) } @@ -329,7 +341,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { // to make it read-only for extra safety. log.Infof("Remounting root as readonly: %q", root) flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_REC) - if err := unix.Mount(root, root, "bind", flags, ""); err != nil { + if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil { return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err) } } @@ -345,10 +357,10 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { return nil } -// setupMounts binds mount all mounts specified in the spec in their correct +// setupMounts bind mounts all mounts specified in the spec in their correct // location inside root. It will resolve relative paths and symlinks. It also // creates directories as needed. -func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error { +func setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error { for _, m := range mounts { if !specutils.Is9PMount(m, conf.VFS2) { continue @@ -366,14 +378,14 @@ func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error { } log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) - if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { - return fmt.Errorf("mounting %v: %v", m, err) + if err := specutils.SafeSetupAndMount(m.Source, dst, m.Type, flags, procPath); err != nil { + return fmt.Errorf("mounting %+v: %v", m, err) } // Set propagation options that cannot be set together with other options. flags = specutils.PropOptionsToFlags(m.Options) if flags != 0 { - if err := unix.Mount("", dst, "", uintptr(flags), ""); err != nil { + if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil { return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err) } } diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go index cd85dabbb..35545e938 100644 --- a/runsc/cmd/help.go +++ b/runsc/cmd/help.go @@ -58,7 +58,7 @@ func (*Help) Usage() string { } // SetFlags implements subcommands.Command.SetFlags. -func (h *Help) SetFlags(f *flag.FlagSet) {} +func (h *Help) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { diff --git a/runsc/cmd/install.go b/runsc/cmd/install.go index 2e223e3be..dc9e01d95 100644 --- a/runsc/cmd/install.go +++ b/runsc/cmd/install.go @@ -58,7 +58,7 @@ func (i *Install) SetFlags(fs *flag.FlagSet) { } // Execute implements subcommands.Command.Execute. -func (i *Install) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { +func (i *Install) Execute(_ context.Context, f *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus { // Grab the name and arguments. runtimeArgs := f.Args() @@ -134,7 +134,7 @@ func (u *Uninstall) SetFlags(fs *flag.FlagSet) { } // Execute implements subcommands.Command.Execute. -func (u *Uninstall) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { +func (u *Uninstall) Execute(context.Context, *flag.FlagSet, ...interface{}) subcommands.ExitStatus { log.Printf("Removing runtime %q from %q.", u.Runtime, u.ConfigFile) c, err := readConfig(u.ConfigFile) diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index 9f9a47bd8..2adfcced7 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -102,7 +102,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) c.CreatedAt.Format(time.RFC3339Nano), c.Owner) } - w.Flush() + _ = w.Flush() case "json": // Print just the states. var states []specs.State diff --git a/runsc/cmd/mitigate.go b/runsc/cmd/mitigate.go index f4e65adb8..1aada5968 100644 --- a/runsc/cmd/mitigate.go +++ b/runsc/cmd/mitigate.go @@ -18,6 +18,7 @@ import ( "context" "fmt" "io/ioutil" + "os" "runtime" "github.com/google/subcommands" @@ -29,8 +30,8 @@ import ( const ( // cpuInfo is the path used to parse CPU info. cpuInfo = "/proc/cpuinfo" - // allPossibleCPUs is the path used to enable CPUs. - allPossibleCPUs = "/sys/devices/system/cpu/possible" + // Path to enable/disable SMT. + smtPath = "/sys/devices/system/cpu/smt/control" ) // Mitigate implements subcommands.Command for the "mitigate" command. @@ -39,10 +40,10 @@ type Mitigate struct { dryRun bool // Reverse mitigate by turning on all CPU cores. reverse bool - // Path to file to read to create CPUSet. - path string // Extra data for post mitigate operations. data string + // Control to mitigate/reverse smt. + control machineControl } // Name implements subcommands.command.name. @@ -56,12 +57,12 @@ func (*Mitigate) Synopsis() string { } // Usage implements Usage for cmd.Mitigate. -func (m Mitigate) Usage() string { +func (m *Mitigate) Usage() string { return fmt.Sprintf(`mitigate [flags] -mitigate mitigates a system to the "MDS" vulnerability by implementing a manual shutdown of SMT. The command checks /proc/cpuinfo for cpus having the MDS vulnerability, and if found, shutdown all but one CPU per hyperthread pair via /sys/devices/system/cpu/cpu{N}/online. CPUs can be restored by writing "2" to each file in /sys/devices/system/cpu/cpu{N}/online or performing a system reboot. +mitigate mitigates a system to the "MDS" vulnerability by writing "off" to %q. CPUs can be restored by writing "on" to the same file or rebooting your system. -The command can be reversed with --reverse, which reads the total CPUs from /sys/devices/system/cpu/possible and enables all with /sys/devices/system/cpu/cpu{N}/online.%s`, m.usage()) +The command can be reversed with --reverse, which writes "on" to the file above.%s`, smtPath, m.usage()) } // SetFlags sets flags for the command Mitigate. @@ -74,104 +75,110 @@ func (m *Mitigate) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. func (m *Mitigate) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { if runtime.GOARCH == "arm64" || runtime.GOARCH == "arm" { - log.Warningf("As ARM is not affected by MDS, mitigate does not support") - return subcommands.ExitFailure + log.Warningf("As ARM is not affected by MDS, mitigate does not support ARM machines.") + // Set reverse flag so that we still perform post mitigate operations. mitigate reverse is a noop in this case. + m.reverse = true } if f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } + m.control = &machineControlImpl{} + return m.execute() +} - m.path = cpuInfo - if m.reverse { - m.path = allPossibleCPUs +// execute executes mitigate operations. Seperate from Execute method for +// easier mocking. +func (m *Mitigate) execute() subcommands.ExitStatus { + beforeSet, err := m.control.getCPUs() + if err != nil { + return Errorf("Get before CPUSet failed: %v", err) } + log.Infof("CPUs before: %s", beforeSet.String()) - set, err := m.doExecute() - if err != nil { - return Errorf("Execute failed: %v", err) + if err := m.doEnableDisable(beforeSet); err != nil { + return Errorf("Enabled/Disable action failed on %q: %v", smtPath, err) } - if m.data == "" { - return subcommands.ExitSuccess + afterSet, err := m.control.getCPUs() + if err != nil { + return Errorf("Get after CPUSet failed: %v", err) } + log.Infof("CPUs after: %s", afterSet.String()) - if err = m.postMitigate(set); err != nil { + if err = m.postMitigate(afterSet); err != nil { return Errorf("Post Mitigate failed: %v", err) } return subcommands.ExitSuccess } -// Execute executes the Mitigate command. -func (m *Mitigate) doExecute() (mitigate.CPUSet, error) { - if m.dryRun { - log.Infof("Running with DryRun. No cpu settings will be changed.") - } - data, err := ioutil.ReadFile(m.path) - if err != nil { - return nil, fmt.Errorf("failed to read %s: %w", m.path, err) - } +// doEnableDisable does either enable or disable operation based on flags. +func (m *Mitigate) doEnableDisable(set mitigate.CPUSet) error { if m.reverse { - set, err := m.doReverse(data) - if err != nil { - return nil, fmt.Errorf("reverse operation failed: %w", err) + if m.dryRun { + log.Infof("Skipping reverse action because dryrun is set.") + return nil } - return set, nil + return m.control.enable() } - set, err := m.doMitigate(data) - if err != nil { - return nil, fmt.Errorf("mitigate operation failed: %w", err) + if m.dryRun { + log.Infof("Skipping mitigate action because dryrun is set.") + return nil } - return set, nil + if set.IsVulnerable() { + return m.control.disable() + } + log.Infof("CPUs not vulnerable. Skipping disable call.") + return nil } -func (m *Mitigate) doMitigate(data []byte) (mitigate.CPUSet, error) { - set, err := mitigate.NewCPUSet(data) - if err != nil { - return nil, err - } +// Interface to wrap interactions with underlying machine. Done +// so testing with mocks can be done hermetically. +type machineControl interface { + enable() error + disable() error + isEnabled() (bool, error) + getCPUs() (mitigate.CPUSet, error) +} - log.Infof("Mitigate found the following CPUs...") - log.Infof("%s", set) +// Implementation of SMT control interaction with the underlying machine. +type machineControlImpl struct{} - disableList := set.GetShutdownList() - log.Infof("Disabling threads on thread pairs.") - for _, t := range disableList { - log.Infof("Disable thread: %s", t) - if m.dryRun { - continue - } - if err := t.Disable(); err != nil { - return nil, fmt.Errorf("error disabling thread: %s err: %w", t, err) - } - } - log.Infof("Shutdown successful.") - return set, nil +func (*machineControlImpl) enable() error { + return checkFileExistsOnWrite("enable", "on") } -func (m *Mitigate) doReverse(data []byte) (mitigate.CPUSet, error) { - set, err := mitigate.NewCPUSetFromPossible(data) - if err != nil { - return nil, err - } +func (*machineControlImpl) disable() error { + return checkFileExistsOnWrite("disable", "off") +} - log.Infof("Reverse mitigate found the following CPUs...") - log.Infof("%s", set) +// Writes data to SMT control. If file not found, logs file not exist error and returns nil +// error, which is done because machines without the file pointed to by smtPath only have one +// thread per core in the first place. Otherwise returns error from ioutil.WriteFile. +func checkFileExistsOnWrite(op, data string) error { + err := ioutil.WriteFile(smtPath, []byte(data), 0644) + if err != nil && os.IsExist(err) { + log.Infof("File %q does not exist for operation %s. This machine probably has no smt control.", smtPath, op) + return nil + } + return err +} - enableList := set.GetRemainingList() +func (*machineControlImpl) isEnabled() (bool, error) { + data, err := ioutil.ReadFile(cpuInfo) + return string(data) == "on", err +} - log.Infof("Enabling all CPUs...") - for _, t := range enableList { - log.Infof("Enabling thread: %s", t) - if m.dryRun { - continue - } - if err := t.Enable(); err != nil { - return nil, fmt.Errorf("error enabling thread: %s err: %w", t, err) - } +func (*machineControlImpl) getCPUs() (mitigate.CPUSet, error) { + data, err := ioutil.ReadFile(cpuInfo) + if err != nil { + return nil, fmt.Errorf("failed to read %s: %w", cpuInfo, err) + } + set, err := mitigate.NewCPUSet(string(data)) + if err != nil { + return nil, fmt.Errorf("getCPUs: %v", err) } - log.Infof("Enable successful.") return set, nil } diff --git a/runsc/cmd/mitigate_extras.go b/runsc/cmd/mitigate_extras.go index 2cb2833f0..2c3e17cd6 100644 --- a/runsc/cmd/mitigate_extras.go +++ b/runsc/cmd/mitigate_extras.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package cmd import ( diff --git a/runsc/cmd/mitigate_test.go b/runsc/cmd/mitigate_test.go index 2d3fef7c1..294fc645c 100644 --- a/runsc/cmd/mitigate_test.go +++ b/runsc/cmd/mitigate_test.go @@ -12,153 +12,139 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package cmd import ( - "fmt" - "io/ioutil" - "os" - "strings" "testing" - "gvisor.dev/gvisor/runsc/mitigate/mock" + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/mitigate" ) -type executeTestCase struct { - name string - mitigateData string - mitigateError error - mitigateCPU int - reverseData string - reverseError error - reverseCPU int +type mockMachineControl struct { + enabled bool + cpus mitigate.CPUSet } -func TestExecute(t *testing.T) { +func (m *mockMachineControl) enable() error { + m.enabled = true + return nil +} - partial := `processor : 1 -vendor_id : AuthenticAMD -cpu family : 23 -model : 49 -model name : AMD EPYC 7B12 -physical id : 0 -bugs : sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass -power management: -` +func (m *mockMachineControl) disable() error { + if m.cpus.IsVulnerable() { + m.enabled = false + } + return nil +} +func (m *mockMachineControl) isEnabled() (bool, error) { + return m.enabled, nil +} + +func (m *mockMachineControl) getCPUs() (mitigate.CPUSet, error) { + set := m.cpus + if !m.enabled { + set = m.cpus[:len(m.cpus)/2] + } + + // Instead of just returning the created CPU set stored in this struct, call + // NewCPUSet to exercise that code path as the machineControlImpl would. + return mitigate.NewCPUSet(set.String()) +} + +type executeTestCase struct { + name string + cpu mitigate.MockCPU + mitigateWantCPUs int + mitigateError subcommands.ExitStatus + mitigateWantEnabled bool + reverseWantCPUs int + reverseError subcommands.ExitStatus + reverseWantEnabled bool + dryrun bool +} + +func TestExecute(t *testing.T) { for _, tc := range []executeTestCase{ { - name: "CascadeLake4", - mitigateData: mock.CascadeLake4.MakeCPUString(), - mitigateCPU: 2, - reverseData: mock.CascadeLake4.MakeSysPossibleString(), - reverseCPU: 4, + name: "CascadeLake4", + cpu: mitigate.CascadeLake4, + mitigateWantCPUs: 2, + mitigateWantEnabled: false, + reverseWantCPUs: 4, + reverseWantEnabled: true, }, { - name: "Empty", - mitigateData: "", - mitigateError: fmt.Errorf(`mitigate operation failed: no cpus found for: ""`), - reverseData: "", - reverseError: fmt.Errorf(`reverse operation failed: mismatch regex from possible: ""`), + name: "CascadeLake4DryRun", + cpu: mitigate.CascadeLake4, + mitigateWantCPUs: 4, + mitigateWantEnabled: true, + reverseWantCPUs: 4, + reverseWantEnabled: true, + dryrun: true, }, { - name: "Partial", - mitigateData: `processor : 0 -vendor_id : AuthenticAMD -cpu family : 23 -model : 49 -model name : AMD EPYC 7B12 -physical id : 0 -core id : 0 -cpu cores : 1 -bugs : sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass -power management::84 - -` + partial, - mitigateError: fmt.Errorf(`mitigate operation failed: failed to match key "core id": %q`, partial), - reverseData: "1-", - reverseError: fmt.Errorf(`reverse operation failed: mismatch regex from possible: %q`, "1-"), + name: "AMD8", + cpu: mitigate.AMD8, + mitigateWantCPUs: 8, + mitigateWantEnabled: true, + reverseWantCPUs: 8, + reverseWantEnabled: true, + }, + { + name: "Empty", + cpu: mitigate.Empty, + mitigateError: Errorf(`mitigate operation failed: no cpus found for: ""`), + reverseError: Errorf(`mitigate operation failed: no cpus found for: ""`), }, } { t.Run(tc.name, func(t *testing.T) { + set := tc.cpu.MakeCPUSet() m := &Mitigate{ - dryRun: true, + control: &mockMachineControl{ + enabled: true, + cpus: set, + }, + dryRun: tc.dryrun, } - m.doExecuteTest(t, "Mitigate", tc.mitigateData, tc.mitigateCPU, tc.mitigateError) + t.Run("Mitigate", func(t *testing.T) { + m.doExecuteTest(t, tc.mitigateWantEnabled, tc.mitigateWantCPUs, tc.mitigateError) + }) m.reverse = true - m.doExecuteTest(t, "Reverse", tc.reverseData, tc.reverseCPU, tc.reverseError) + t.Run("Reverse", func(t *testing.T) { + m.doExecuteTest(t, tc.reverseWantEnabled, tc.reverseWantCPUs, tc.reverseError) + }) }) } } -func TestExecuteSmoke(t *testing.T) { - smokeMitigate, err := ioutil.ReadFile(cpuInfo) - if err != nil { - t.Fatalf("Failed to read %s: %v", cpuInfo, err) +// doExecuteTest runs Execute with the mitigate operation and reverse operation. +func (m *Mitigate) doExecuteTest(t *testing.T, wantEnabled bool, wantCPUs int, wantErr subcommands.ExitStatus) { + subError := m.execute() + if subError != wantErr { + t.Fatalf("Mitigate error mismatch: want: %v got: %v", wantErr, subError) } - m := &Mitigate{ - dryRun: true, + // case where test should end in error or we don't care + // about how many cpus are returned. + if wantErr != subcommands.ExitSuccess { + log.Infof("return") + return } - m.doExecuteTest(t, "Mitigate", string(smokeMitigate), 0, nil) - - smokeReverse, err := ioutil.ReadFile(allPossibleCPUs) - if err != nil { - t.Fatalf("Failed to read %s: %v", allPossibleCPUs, err) + gotEnabled, _ := m.control.isEnabled() + if wantEnabled != gotEnabled { + t.Fatalf("Incorrect enabled state: want: %t got: %t", wantEnabled, gotEnabled) } - m.reverse = true - m.doExecuteTest(t, "Reverse", string(smokeReverse), 0, nil) -} - -// doExecuteTest runs Execute with the mitigate operation and reverse operation. -func (m *Mitigate) doExecuteTest(t *testing.T, name, data string, want int, wantErr error) { - t.Run(name, func(t *testing.T) { - file, err := ioutil.TempFile("", "outfile.txt") - if err != nil { - t.Fatalf("Failed to create tmpfile: %v", err) - } - defer os.Remove(file.Name()) - - if _, err := file.WriteString(data); err != nil { - t.Fatalf("Failed to write to file: %v", err) - } - - // Set fields for mitigate and dryrun to keep test hermetic. - m.path = file.Name() - - set, err := m.doExecute() - if err = checkErr(wantErr, err); err != nil { - t.Fatalf("Mitigate error mismatch: %v", err) - } - - // case where test should end in error or we don't care - // about how many cpus are returned. - if wantErr != nil || want < 1 { - return - } - got := len(set.GetRemainingList()) - if want != got { - t.Fatalf("Failed wrong number of remaining CPUs: want %d, got %d", want, got) - } - - }) -} - -// checkErr checks error for equality. -func checkErr(want, got error) error { - switch { - case want == nil && got == nil: - case want != nil && got == nil: - fallthrough - case want == nil && got != nil: - fallthrough - case want.Error() != strings.Trim(got.Error(), " "): - return fmt.Errorf("got: %v want: %v", got, want) + gotCPUs, _ := m.control.getCPUs() + if len(gotCPUs) != wantCPUs { + t.Fatalf("Incorrect number of CPUs: want: %d got: %d", wantCPUs, len(gotCPUs)) } - return nil } diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index 15ef7b577..9768f1cfb 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -42,7 +42,7 @@ func (*Pause) Usage() string { } // SetFlags implements subcommands.Command.SetFlags. -func (*Pause) SetFlags(f *flag.FlagSet) { +func (*Pause) SetFlags(*flag.FlagSet) { } // Execute implements subcommands.Command.Execute. diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index 856469252..d62e89e80 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -43,7 +43,7 @@ func (*Resume) Usage() string { } // SetFlags implements subcommands.Command.SetFlags. -func (r *Resume) SetFlags(f *flag.FlagSet) { +func (r *Resume) SetFlags(*flag.FlagSet) { } // Execute implements subcommands.Command.Execute. diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 722181aff..da11c9d06 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -68,7 +68,14 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s waitStatus := args[1].(*unix.WaitStatus) if conf.Rootless { - return Errorf("Rootless mode not supported with %q", r.Name()) + if conf.Network == config.NetworkSandbox { + return Errorf("sandbox network isn't supported with --rootless, use --network=none or --network=host") + } + + if err := specutils.MaybeRunAsRoot(); err != nil { + return Errorf("Error executing inside namespace: %v", err) + } + // Execution will continue here if no more capabilities are needed... } bundleDir := r.bundleDir diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 964a65064..7c395d722 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -43,7 +43,7 @@ func (*Start) Usage() string { } // SetFlags implements subcommands.Command.SetFlags. -func (*Start) SetFlags(f *flag.FlagSet) {} +func (*Start) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 1f7913d5a..061003bab 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -45,7 +45,7 @@ func (*State) Usage() string { } // SetFlags implements subcommands.Command.SetFlags. -func (*State) SetFlags(f *flag.FlagSet) {} +func (*State) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { @@ -71,6 +71,8 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s if err != nil { Fatalf("marshaling container state: %v", err) } - os.Stdout.Write(b) + if _, err := os.Stdout.Write(b); err != nil { + Fatalf("Error writing to stdout: %v", err) + } return subcommands.ExitSuccess } diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go index a8c83d662..608be9bb4 100644 --- a/runsc/cmd/syscalls.go +++ b/runsc/cmd/syscalls.go @@ -103,7 +103,7 @@ func (s *Syscalls) SetFlags(f *flag.FlagSet) { } // Execute implements subcommands.Command.Execute. -func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { +func (s *Syscalls) Execute(context.Context, *flag.FlagSet, ...interface{}) subcommands.ExitStatus { out, ok := outputMap[s.format] if !ok { Fatalf("Unsupported output format %q", s.format) diff --git a/runsc/cmd/usage.go b/runsc/cmd/usage.go new file mode 100644 index 000000000..d2aeafa28 --- /dev/null +++ b/runsc/cmd/usage.go @@ -0,0 +1,93 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "os" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Usage implements subcommands.Command for the "usage" command. +type Usage struct { + full bool + fd bool +} + +// Name implements subcommands.Command.Name. +func (*Usage) Name() string { + return "usage" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Usage) Synopsis() string { + return "Usage shows application memory usage across various categories in bytes." +} + +// Usage implements subcommands.Command.Usage. +func (*Usage) Usage() string { + return `usage [flags] <container id> - print memory usages to standard output.` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (u *Usage) SetFlags(f *flag.FlagSet) { + f.BoolVar(&u.full, "full", false, "enumerate all usage by categories") + f.BoolVar(&u.fd, "fd", false, "retrieves a subset of usage through the established usage FD") +} + +// Execute implements subcommands.Command.Execute. +func (u *Usage) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() < 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*config.Config) + + cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) + if err != nil { + Fatalf("loading container: %v", err) + } + + if !u.fd { + m, err := cont.Usage(u.full) + if err != nil { + Fatalf("usage failed: %v", err) + } + if err := json.NewEncoder(os.Stdout).Encode(m); err != nil { + Fatalf("Encode MemoryUsage failed: %v", err) + } + } else { + m, err := cont.UsageFD() + if err != nil { + Fatalf("usagefd failed: %v", err) + } + + mapped, unknown, total, err := m.Fetch() + if err != nil { + Fatalf("Fetch memory usage failed: %v", err) + } + + fmt.Printf("Mapped %v, Unknown %v, Total %v\n", mapped, unknown, total) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/verity_prepare.go b/runsc/cmd/verity_prepare.go index 66128b2a3..44c1d05db 100644 --- a/runsc/cmd/verity_prepare.go +++ b/runsc/cmd/verity_prepare.go @@ -82,18 +82,23 @@ func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...inte }, Process: &specs.Process{ Cwd: absRoot, - Args: []string{c.tool, "--path", "/verityroot"}, + Args: []string{c.tool, "--path", "/verityroot", "--rawpath", "/rawroot"}, Env: os.Environ(), Capabilities: specutils.AllCapabilities(), }, Hostname: hostname, Mounts: []specs.Mount{ - specs.Mount{ + { Source: c.dir, Destination: "/verityroot", Type: "bind", Options: []string{"verity.roothash="}, }, + { + Source: c.dir, + Destination: "/rawroot", + Type: "bind", + }, }, } diff --git a/runsc/config/BUILD b/runsc/config/BUILD index b1672bb9d..64295d283 100644 --- a/runsc/config/BUILD +++ b/runsc/config/BUILD @@ -11,6 +11,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/refs", + "//pkg/sentry/control:control_go_proto", "//pkg/sentry/watchdog", "//pkg/sync", "//runsc/flag", @@ -24,5 +25,8 @@ go_test( "config_test.go", ], library = ":config", - deps = ["//runsc/flag"], + deps = [ + "//pkg/sentry/control:control_go_proto", + "//runsc/flag", + ], ) diff --git a/runsc/config/config.go b/runsc/config/config.go index 3d8c7a0ab..a562f7bf4 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -19,8 +19,10 @@ package config import ( "fmt" + "strings" "gvisor.dev/gvisor/pkg/refs" + controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto" "gvisor.dev/gvisor/pkg/sentry/watchdog" ) @@ -84,6 +86,9 @@ type Config struct { // capabilities. EnableRaw bool `flag:"net-raw"` + // AllowPacketEndpointWrite enables write operations on packet endpoints. + AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"` + // HardwareGSO indicates that hardware segmentation offload is enabled. HardwareGSO bool `flag:"gso"` @@ -117,6 +122,10 @@ type Config struct { // StraceLogSize is the max size of data blobs to display. StraceLogSize uint `flag:"strace-log-size"` + // StraceEvent indicates sending strace to events if true. Strace is + // sent to log if false. + StraceEvent bool `flag:"strace-event"` + // DisableSeccomp indicates whether seccomp syscall filters should be // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool @@ -131,6 +140,29 @@ type Config struct { // ProfileEnable is set to prepare the sandbox to be profiled. ProfileEnable bool `flag:"profile"` + // ProfileBlock collects a block profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileBlock string `flag:"profile-block"` + + // ProfileCPU collects a CPU profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileCPU string `flag:"profile-cpu"` + + // ProfileHeap collects a heap profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileHeap string `flag:"profile-heap"` + + // ProfileMutex collects a mutex profile to the passed file for the + // duration of the container execution. Requires ProfileEnabled. + ProfileMutex string `flag:"profile-mutex"` + + // TraceFile collects a Go runtime execution trace to the passed file + // for the duration of the container execution. + TraceFile string `flag:"trace"` + + // Controls defines the controls that may be enabled. + Controls controlConfig `flag:"controls"` + // RestoreFile is the path to the saved container image RestoreFile string @@ -142,7 +174,8 @@ type Config struct { // Rootless allows the sandbox to be started with a user that is not root. // Defense in depth measures are weaker in rootless mode. Specifically, the // sandbox and Gofer process run as root inside a user namespace with root - // mapped to the caller's user. + // mapped to the caller's user. When using rootless, the container root path + // should not have a symlink. Rootless bool `flag:"rootless"` // AlsoLogToStderr allows to send log messages to stderr. @@ -175,7 +208,8 @@ type Config struct { // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be - // necessary in test environments that have limited capabilities. + // necessary in test environments that have limited capabilities. When + // disabling chroot, the container root path should not have a symlink. TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"` // TestOnlyTestNameEnv should only be used in tests. It looks up for the @@ -193,6 +227,21 @@ func (c *Config) validate() error { if c.NumNetworkChannels <= 0 { return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels) } + // Require profile flags to explicitly opt-in to profiling with + // -profile rather than implying it since these options have security + // implications. + if c.ProfileBlock != "" && !c.ProfileEnable { + return fmt.Errorf("profile-block flag requires enabling profiling with profile flag") + } + if c.ProfileCPU != "" && !c.ProfileEnable { + return fmt.Errorf("profile-cpu flag requires enabling profiling with profile flag") + } + if c.ProfileHeap != "" && !c.ProfileEnable { + return fmt.Errorf("profile-heap flag requires enabling profiling with profile flag") + } + if c.ProfileMutex != "" && !c.ProfileEnable { + return fmt.Errorf("profile-mutex flag requires enabling profiling with profile flag") + } return nil } @@ -345,6 +394,96 @@ func (q QueueingDiscipline) String() string { panic(fmt.Sprintf("Invalid qdisc %d", q)) } +// controlConfig represents control endpoints. +type controlConfig struct { + Controls *controlpb.ControlConfig +} + +// Set implements flag.Value. +func (c *controlConfig) Set(v string) error { + controls := strings.Split(v, ",") + var controlList []controlpb.ControlConfig_Endpoint + for _, control := range controls { + switch control { + case "EVENTS": + controlList = append(controlList, controlpb.ControlConfig_EVENTS) + case "FS": + controlList = append(controlList, controlpb.ControlConfig_FS) + case "LIFECYCLE": + controlList = append(controlList, controlpb.ControlConfig_LIFECYCLE) + case "LOGGING": + controlList = append(controlList, controlpb.ControlConfig_LOGGING) + case "PROFILE": + controlList = append(controlList, controlpb.ControlConfig_PROFILE) + case "USAGE": + controlList = append(controlList, controlpb.ControlConfig_USAGE) + case "PROC": + controlList = append(controlList, controlpb.ControlConfig_PROC) + case "STATE": + controlList = append(controlList, controlpb.ControlConfig_STATE) + case "DEBUG": + controlList = append(controlList, controlpb.ControlConfig_DEBUG) + default: + return fmt.Errorf("invalid control %q", control) + } + } + c.Controls.AllowedControls = controlList + return nil +} + +// Get implements flag.Value. +func (c *controlConfig) Get() interface{} { + return *c +} + +// String implements flag.Value. +func (c *controlConfig) String() string { + v := "" + for _, control := range c.Controls.GetAllowedControls() { + if len(v) > 0 { + v += "," + } + switch control { + case controlpb.ControlConfig_EVENTS: + v += "EVENTS" + case controlpb.ControlConfig_FS: + v += "FS" + case controlpb.ControlConfig_LIFECYCLE: + v += "LIFECYCLE" + case controlpb.ControlConfig_LOGGING: + v += "LOGGING" + case controlpb.ControlConfig_PROFILE: + v += "PROFILE" + case controlpb.ControlConfig_USAGE: + v += "USAGE" + case controlpb.ControlConfig_PROC: + v += "PROC" + case controlpb.ControlConfig_STATE: + v += "STATE" + case controlpb.ControlConfig_DEBUG: + v += "DEBUG" + default: + panic(fmt.Sprintf("Invalid control %d", control)) + } + } + return v +} + +func defaultControlConfig() *controlConfig { + c := controlConfig{} + c.Controls = &controlpb.ControlConfig{} + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_EVENTS) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_FS) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_LIFECYCLE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_LOGGING) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_PROFILE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_USAGE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_PROC) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_STATE) + c.Controls.AllowedControls = append(c.Controls.AllowedControls, controlpb.ControlConfig_DEBUG) + return &c +} + func leakModePtr(v refs.LeakMode) *refs.LeakMode { return &v } diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go index fb162b7eb..57c241c86 100644 --- a/runsc/config/config_test.go +++ b/runsc/config/config_test.go @@ -18,6 +18,7 @@ import ( "strings" "testing" + controlpb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto" "gvisor.dev/gvisor/runsc/flag" ) @@ -41,21 +42,43 @@ func TestDefault(t *testing.T) { } } -func setDefault(name string) { +func setDefault(name string) error { fl := flag.CommandLine.Lookup(name) - fl.Value.Set(fl.DefValue) + return fl.Value.Set(fl.DefValue) } func TestFromFlags(t *testing.T) { - flag.CommandLine.Lookup("root").Value.Set("some-path") - flag.CommandLine.Lookup("debug").Value.Set("true") - flag.CommandLine.Lookup("num-network-channels").Value.Set("123") - flag.CommandLine.Lookup("network").Value.Set("none") + if err := flag.CommandLine.Lookup("root").Value.Set("some-path"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := flag.CommandLine.Lookup("debug").Value.Set("true"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := flag.CommandLine.Lookup("num-network-channels").Value.Set("123"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := flag.CommandLine.Lookup("network").Value.Set("none"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := flag.CommandLine.Lookup("controls").Value.Set("EVENTS,FS"); err != nil { + t.Errorf("Flag set: %v", err) + } defer func() { - setDefault("root") - setDefault("debug") - setDefault("num-network-channels") - setDefault("network") + if err := setDefault("root"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := setDefault("debug"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := setDefault("num-network-channels"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := setDefault("network"); err != nil { + t.Errorf("Flag set: %v", err) + } + if err := setDefault("controls"); err != nil { + t.Errorf("Flag set: %v", err) + } }() c, err := NewFromFlags() @@ -74,6 +97,12 @@ func TestFromFlags(t *testing.T) { if want := NetworkNone; c.Network != want { t.Errorf("Network=%v, want: %v", c.Network, want) } + wants := []controlpb.ControlConfig_Endpoint{controlpb.ControlConfig_EVENTS, controlpb.ControlConfig_FS} + for i, want := range wants { + if c.Controls.Controls.AllowedControls[i] != want { + t.Errorf("Controls.Controls.AllowedControls[%d]=%v, want: %v", i, c.Controls.Controls.AllowedControls[i], want) + } + } } func TestToFlags(t *testing.T) { @@ -85,10 +114,15 @@ func TestToFlags(t *testing.T) { c.Debug = true c.NumNetworkChannels = 123 c.Network = NetworkNone + c.Controls = controlConfig{ + Controls: &controlpb.ControlConfig{ + AllowedControls: []controlpb.ControlConfig_Endpoint{controlpb.ControlConfig_EVENTS, controlpb.ControlConfig_FS}, + }, + } flags := c.ToFlags() - if len(flags) != 4 { - t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags) + if len(flags) != 5 { + t.Errorf("wrong number of flags set, want: 5, got: %d: %s", len(flags), flags) } t.Logf("Flags: %s", flags) fm := map[string]string{} @@ -101,6 +135,7 @@ func TestToFlags(t *testing.T) { "--debug": "true", "--num-network-channels": "123", "--network": "none", + "--controls": "EVENTS,FS", } { if got, ok := fm[name]; ok { if got != want { diff --git a/runsc/config/flags.go b/runsc/config/flags.go index 6f1b5927a..1bf23951a 100644 --- a/runsc/config/flags.go +++ b/runsc/config/flags.go @@ -56,16 +56,23 @@ func RegisterFlags() { flag.Bool("strace", false, "enable strace.") flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.") + flag.Bool("strace-event", false, "send strace to event.") // Flags that control sandbox runtime behavior. flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.") flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + flag.String("profile-block", "", "collects a block profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("profile-cpu", "", "collects a CPU profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("profile-heap", "", "collects a heap profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("profile-mutex", "", "collects a mutex profile to this file path for the duration of the container execution. Requires -profile=true.") + flag.String("trace", "", "collects a Go runtime execution trace to this file path for the duration of the container execution.") flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.") flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.") + flag.Var(defaultControlConfig(), "controls", "Sentry control endpoints.") // Flags that control sandbox runtime behavior: FS related. flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem validation to use for the root mount: exclusive (default), shared.") @@ -90,6 +97,7 @@ func RegisterFlags() { // Test flags, not to be used outside tests, ever. flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.") + flag.Bool("TESTONLY-allow-packet-endpoint-write", false, "TEST ONLY; do not ever use! Used for tests to allow writes on packet sockets.") }) } diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 79b056fce..9d36086c3 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -288,7 +288,7 @@ func TestJobControlSignalExec(t *testing.T) { StdioIsPty: true, } - pid, err := c.Execute(execArgs) + pid, err := c.Execute(conf, execArgs) if err != nil { t.Fatalf("error executing: %v", err) } @@ -308,7 +308,9 @@ func TestJobControlSignalExec(t *testing.T) { } // Execute sleep. - ptyMaster.Write([]byte("sleep 100\n")) + if _, err := ptyMaster.Write([]byte("sleep 100\n")); err != nil { + t.Fatalf("ptyMaster.Write: %v", err) + } // Wait for it to start. Sleep's PPID is bash's PID. expectedPL = append(expectedPL, newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process()) @@ -411,7 +413,9 @@ func TestJobControlSignalRootContainer(t *testing.T) { // which makes this a suitable Reader for WaitUntilRead. ptyBuf := newBlockingBuffer() tee := io.TeeReader(ptyMaster, ptyBuf) - go io.Copy(os.Stderr, tee) + go func() { + _, _ = io.Copy(os.Stderr, tee) + }() // Start the container. if err := c.Start(conf); err != nil { @@ -444,7 +448,9 @@ func TestJobControlSignalRootContainer(t *testing.T) { } // Execute sleep via the terminal. - ptyMaster.Write([]byte("sleep 100\n")) + if _, err := ptyMaster.Write([]byte("sleep 100\n")); err != nil { + t.Fatalf("ptyMaster.Write(): %v", err) + } // Wait for sleep to start. expectedPL = append(expectedPL, newProcessBuilder().PID(2).PPID(1).Cmd("sleep").Process()) @@ -563,13 +569,15 @@ func TestMultiContainerTerminal(t *testing.T) { // file. Writes after a certain point will block unless we drain the // PTY, so we must continually copy from it. // - // We log the output to stderr for debugabilitly, and also to a buffer, + // We log the output to stderr for debuggability, and also to a buffer, // since we wait on particular output from bash below. We use a custom // blockingBuffer which is thread-safe and also blocks on Read calls, // which makes this a suitable Reader for WaitUntilRead. ptyBuf := newBlockingBuffer() tee := io.TeeReader(tc.master, ptyBuf) - go io.Copy(os.Stderr, tee) + go func() { + _, _ = io.Copy(os.Stderr, tee) + }() // Wait for bash to start. expectedPL := []*control.Process{ @@ -581,7 +589,9 @@ func TestMultiContainerTerminal(t *testing.T) { // Execute echo command and check that it was executed correctly. Use // a variable to ensure it's not matching against command echo. - tc.master.Write([]byte("echo foo-${PWD}-123\n")) + if _, err := tc.master.Write([]byte("echo foo-${PWD}-123\n")); err != nil { + t.Fatalf("master.Write(): %v", err) + } if err := testutil.WaitUntilRead(ptyBuf, "foo-/-123", 5*time.Second); err != nil { t.Fatalf("echo didn't execute: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 0820edaec..50b0dd5e7 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -208,7 +208,7 @@ func New(conf *config.Config, args Args) (*Container, error) { if err := c.Saver.lockForNew(); err != nil { return nil, err } - defer c.Saver.unlock() + defer c.Saver.unlockOrDie() // If the metadata annotations indicate that this container should be started // in an existing sandbox, we must do so. These are the possible metadata @@ -310,7 +310,7 @@ func New(conf *config.Config, args Args) (*Container, error) { defer tty.Close() } - if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil { + if err := c.Sandbox.CreateSubcontainer(conf, c.ID, tty); err != nil { return nil, err } } @@ -340,7 +340,7 @@ func (c *Container) Start(conf *config.Config) error { if err := c.Saver.lock(); err != nil { return err } - unlock := cleanup.Make(func() { c.Saver.unlock() }) + unlock := cleanup.Make(c.Saver.unlockOrDie) defer unlock.Clean() if err := c.requireStatus("start", Created); err != nil { @@ -388,7 +388,7 @@ func (c *Container) Start(conf *config.Config) error { stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} } - return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles) + return c.Sandbox.StartSubcontainer(c.Spec, conf, c.ID, stdios, goferFiles) }); err != nil { return err } @@ -426,7 +426,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile s if err := c.Saver.lock(); err != nil { return err } - defer c.Saver.unlock() + defer c.Saver.unlockOrDie() if err := c.requireStatus("restore", Created); err != nil { return err @@ -480,13 +480,13 @@ func Run(conf *config.Config, args Args) (unix.WaitStatus, error) { // Execute runs the specified command in the container. It returns the PID of // the newly created process. -func (c *Container) Execute(args *control.ExecArgs) (int32, error) { +func (c *Container) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args) if err := c.requireStatus("execute in", Created, Running); err != nil { return 0, err } args.ContainerID = c.ID - return c.Sandbox.Execute(args) + return c.Sandbox.Execute(conf, args) } // Event returns events for the container. @@ -614,7 +614,7 @@ func (c *Container) Pause() error { if err := c.Saver.lock(); err != nil { return err } - defer c.Saver.unlock() + defer c.Saver.unlockOrDie() if c.Status != Created && c.Status != Running { return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status) @@ -634,7 +634,7 @@ func (c *Container) Resume() error { if err := c.Saver.lock(); err != nil { return err } - defer c.Saver.unlock() + defer c.Saver.unlockOrDie() if c.Status != Paused { return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) @@ -646,6 +646,36 @@ func (c *Container) Resume() error { return c.saveLocked() } +// Cat prints out the content of the files. +func (c *Container) Cat(files []string, out *os.File) error { + log.Debugf("Cat in container, cid: %s, files: %+v", c.ID, files) + return c.Sandbox.Cat(c.ID, files, out) +} + +// Usage displays memory used by the application. +func (c *Container) Usage(full bool) (control.MemoryUsage, error) { + log.Debugf("Usage in container, cid: %s, full: %v", c.ID, full) + return c.Sandbox.Usage(c.ID, full) +} + +// UsageFD shows application memory usage using two donated FDs. +func (c *Container) UsageFD() (*control.MemoryUsageRecord, error) { + log.Debugf("UsageFD in container, cid: %s", c.ID) + return c.Sandbox.UsageFD(c.ID) +} + +// Reduce requests that the sentry attempt to reduce its memory usage. +func (c *Container) Reduce(wait bool) error { + log.Debugf("Reduce in container, cid: %s", c.ID) + return c.Sandbox.Reduce(c.ID, wait) +} + +// Stream dumps all events to out. +func (c *Container) Stream(filters []string, out *os.File) error { + log.Debugf("Stream in container, cid: %s", c.ID) + return c.Sandbox.Stream(c.ID, filters, out) +} + // State returns the metadata of the container. func (c *Container) State() specs.State { return specs.State{ @@ -675,8 +705,8 @@ func (c *Container) Destroy() error { return err } defer func() { - c.Saver.unlock() - c.Saver.close() + c.Saver.unlockOrDie() + _ = c.Saver.close() }() // Stored for later use as stop() sets c.Sandbox to nil. @@ -789,30 +819,31 @@ func (c *Container) stop() error { } func (c *Container) waitForStopped() error { + if c.GoferPid == 0 { + return nil + } + + if c.IsSandboxRunning() { + if err := c.SignalContainer(unix.Signal(0), false); err == nil { + return fmt.Errorf("container is still running") + } + } + + if c.goferIsChild { + // The gofer process is a child of the current process, + // so we can wait it and collect its zombie. + if _, err := unix.Wait4(int(c.GoferPid), nil, 0, nil); err != nil { + return fmt.Errorf("error waiting the gofer process: %v", err) + } + c.GoferPid = 0 + return nil + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if c.IsSandboxRunning() { - if err := c.SignalContainer(unix.Signal(0), false); err == nil { - return fmt.Errorf("container is still running") - } - } - if c.GoferPid == 0 { - return nil - } - if c.goferIsChild { - // The gofer process is a child of the current process, - // so we can wait it and collect its zombie. - wpid, err := unix.Wait4(int(c.GoferPid), nil, unix.WNOHANG, nil) - if err != nil { - return fmt.Errorf("error waiting the gofer process: %v", err) - } - if wpid == 0 { - return fmt.Errorf("gofer is still running") - } - - } else if err := unix.Kill(c.GoferPid, 0); err == nil { + if err := unix.Kill(c.GoferPid, 0); err == nil { return fmt.Errorf("gofer is still running") } c.GoferPid = 0 @@ -910,6 +941,9 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu binPath := specutils.ExePath cmd := exec.Command(binPath, args...) cmd.ExtraFiles = goferEnds + + // Set Args[0] to make easier to spot the gofer process. Otherwise it's + // shown as `exe`. cmd.Args[0] = "runsc-gofer" if attached { @@ -1020,10 +1054,10 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error { return fn() } restore, err := cg.Join() - defer restore() if err != nil { return err } + defer restore() return fn() } diff --git a/runsc/container/container_norace_test.go b/runsc/container/container_norace_test.go index 838c1e20a..a4daf16ed 100644 --- a/runsc/container/container_norace_test.go +++ b/runsc/container/container_norace_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !race // +build !race package container diff --git a/runsc/container/container_race_test.go b/runsc/container/container_race_test.go index 9fb4c4fc0..86a57145c 100644 --- a/runsc/container/container_race_test.go +++ b/runsc/container/container_race_test.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build race // +build race package container diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 249324c5a..69dcf3f03 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -53,19 +53,22 @@ func TestMain(m *testing.M) { if err := testutil.ConfigureExePath(); err != nil { panic(err.Error()) } - specutils.MaybeRunAsRoot() + if err := specutils.MaybeRunAsRoot(); err != nil { + fmt.Fprintf(os.Stderr, "Error running as root: %v", err) + os.Exit(123) + } os.Exit(m.Run()) } -func execute(cont *Container, name string, arg ...string) (unix.WaitStatus, error) { +func execute(conf *config.Config, cont *Container, name string, arg ...string) (unix.WaitStatus, error) { args := &control.ExecArgs{ Filename: name, Argv: append([]string{name}, arg...), } - return cont.executeSync(args) + return cont.executeSync(conf, args) } -func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, error) { +func executeCombinedOutput(conf *config.Config, cont *Container, name string, arg ...string) ([]byte, error) { r, w, err := os.Pipe() if err != nil { return nil, err @@ -77,7 +80,7 @@ func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, Argv: append([]string{name}, arg...), FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}}, } - ws, err := cont.executeSync(args) + ws, err := cont.executeSync(conf, args) w.Close() if err != nil { return nil, err @@ -91,8 +94,8 @@ func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, } // executeSync synchronously executes a new process. -func (c *Container) executeSync(args *control.ExecArgs) (unix.WaitStatus, error) { - pid, err := c.Execute(args) +func (c *Container) executeSync(conf *config.Config, args *control.ExecArgs) (unix.WaitStatus, error) { + pid, err := c.Execute(conf, args) if err != nil { return 0, fmt.Errorf("error executing: %v", err) } @@ -169,8 +172,8 @@ func blockUntilWaitable(pid int) error { } // execPS executes `ps` inside the container and return the processes. -func execPS(c *Container) ([]*control.Process, error) { - out, err := executeCombinedOutput(c, "/bin/ps", "-e") +func execPS(conf *config.Config, c *Container) ([]*control.Process, error) { + out, err := executeCombinedOutput(conf, c, "/bin/ps", "-e") if err != nil { return nil, err } @@ -439,6 +442,11 @@ func configs(t *testing.T, opts ...configOption) map[string]*config.Config { return all } +// sleepSpec generates a spec with sleep 1000 and a conf. +func sleepSpecConf(t *testing.T) (*specs.Spec, *config.Config) { + return testutil.NewSpecWithArgs("sleep", "1000"), testutil.TestConfig(t) +} + // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. @@ -452,7 +460,7 @@ func TestLifecycle(t *testing.T) { t.Run(name, func(t *testing.T) { // The container will just sleep for a long time. We will kill it before // it finishes sleeping. - spec := testutil.NewSpecWithArgs("sleep", "100") + spec, _ := sleepSpecConf(t) rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -523,9 +531,11 @@ func TestLifecycle(t *testing.T) { ws, err := c.Wait() if err != nil { ch <- err + return } if got, want := ws.Signal(), unix.SIGTERM; got != want { ch <- fmt.Errorf("got signal %v, want %v", got, want) + return } ch <- nil }() @@ -859,7 +869,7 @@ func TestExec(t *testing.T) { } { t.Run(tc.name, func(t *testing.T) { // t.Parallel() - if ws, err := cont.executeSync(&tc.args); err != nil { + if ws, err := cont.executeSync(conf, &tc.args); err != nil { t.Fatalf("executeAsync(%+v): %v", tc.args, err) } else if ws != 0 { t.Fatalf("executeAsync(%+v) failed with exit: %v", tc.args, ws) @@ -877,7 +887,7 @@ func TestExec(t *testing.T) { } defer unix.Close(fds[0]) - _, err = cont.executeSync(&control.ExecArgs{ + _, err = cont.executeSync(conf, &control.ExecArgs{ Argv: []string{"/nonexist"}, FilePayload: urpc.FilePayload{ Files: []*os.File{os.NewFile(uintptr(fds[1]), "sock")}, @@ -898,7 +908,7 @@ func TestExecProcList(t *testing.T) { for name, conf := range configs(t, all...) { t.Run(name, func(t *testing.T) { const uid = 343 - spec := testutil.NewSpecWithArgs("sleep", "100") + spec, _ := sleepSpecConf(t) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -932,7 +942,7 @@ func TestExecProcList(t *testing.T) { // start running exec (which blocks). ch := make(chan error) go func() { - exitStatus, err := cont.executeSync(execArgs) + exitStatus, err := cont.executeSync(conf, execArgs) if err != nil { ch <- err } else if exitStatus != 0 { @@ -1417,8 +1427,7 @@ func TestPauseResume(t *testing.T) { // with calls to pause and resume and that pausing and resuming only // occurs given the correct state. func TestPauseResumeStatus(t *testing.T) { - spec := testutil.NewSpecWithArgs("sleep", "20") - conf := testutil.TestConfig(t) + spec, conf := sleepSpecConf(t) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -1485,7 +1494,7 @@ func TestCapabilities(t *testing.T) { for name, conf := range configs(t, all...) { t.Run(name, func(t *testing.T) { - spec := testutil.NewSpecWithArgs("sleep", "100") + spec, _ := sleepSpecConf(t) rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -1525,7 +1534,9 @@ func TestCapabilities(t *testing.T) { defer os.Remove(exePath) // Need to traverse the intermediate directory. - os.Chmod(rootDir, 0755) + if err := os.Chmod(rootDir, 0755); err != nil { + t.Fatal(err) + } execArgs := &control.ExecArgs{ Filename: exePath, @@ -1537,7 +1548,7 @@ func TestCapabilities(t *testing.T) { } // "exe" should fail because we don't have the necessary permissions. - if _, err := cont.executeSync(execArgs); err == nil { + if _, err := cont.executeSync(conf, execArgs); err == nil { t.Fatalf("container executed without error, but an error was expected") } @@ -1546,7 +1557,7 @@ func TestCapabilities(t *testing.T) { EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), } // "exe" should not fail this time. - if _, err := cont.executeSync(execArgs); err != nil { + if _, err := cont.executeSync(conf, execArgs); err != nil { t.Fatalf("container failed to exec %v: %v", args, err) } }) @@ -1633,7 +1644,7 @@ func TestMountNewDir(t *testing.T) { func TestReadonlyRoot(t *testing.T) { for name, conf := range configs(t, all...) { t.Run(name, func(t *testing.T) { - spec := testutil.NewSpecWithArgs("sleep", "100") + spec, _ := sleepSpecConf(t) spec.Root.Readonly = true _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) @@ -1657,7 +1668,7 @@ func TestReadonlyRoot(t *testing.T) { } // Read mounts to check that root is readonly. - out, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / ' | grep -o -e '(.*)'") + out, err := executeCombinedOutput(conf, c, "/bin/sh", "-c", "mount | grep ' / ' | grep -o -e '(.*)'") if err != nil { t.Fatalf("exec failed: %v", err) } @@ -1667,7 +1678,7 @@ func TestReadonlyRoot(t *testing.T) { } // Check that file cannot be created. - ws, err := execute(c, "/bin/touch", "/foo") + ws, err := execute(conf, c, "/bin/touch", "/foo") if err != nil { t.Fatalf("touch file in ro mount: %v", err) } @@ -1685,7 +1696,7 @@ func TestReadonlyMount(t *testing.T) { if err != nil { t.Fatalf("ioutil.TempDir() failed: %v", err) } - spec := testutil.NewSpecWithArgs("sleep", "100") + spec, _ := sleepSpecConf(t) spec.Mounts = append(spec.Mounts, specs.Mount{ Destination: dir, Source: dir, @@ -1716,7 +1727,7 @@ func TestReadonlyMount(t *testing.T) { // Read mounts to check that volume is readonly. cmd := fmt.Sprintf("mount | grep ' %s ' | grep -o -e '(.*)'", dir) - out, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd) + out, err := executeCombinedOutput(conf, c, "/bin/sh", "-c", cmd) if err != nil { t.Fatalf("exec failed, err: %v", err) } @@ -1726,7 +1737,7 @@ func TestReadonlyMount(t *testing.T) { } // Check that file cannot be created. - ws, err := execute(c, "/bin/touch", path.Join(dir, "file")) + ws, err := execute(conf, c, "/bin/touch", path.Join(dir, "file")) if err != nil { t.Fatalf("touch file in ro mount: %v", err) } @@ -1845,7 +1856,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) { "baz-" + testutil.RandomContainerID(), } for _, cid := range cids { - spec := testutil.NewSpecWithArgs("sleep", "100") + spec, _ := sleepSpecConf(t) bundleDir, cleanup, err := testutil.SetupBundleDir(spec) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -2153,7 +2164,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) { go func() { defer wg.Done() // Ignore failures, start can fail if destroy runs first. - startCont.Start(conf) + _ = startCont.Start(conf) }() wg.Add(1) @@ -2222,7 +2233,7 @@ func TestMountPropagation(t *testing.T) { t.Fatalf("mount(%q, MS_SHARED): %v", srcMnt, err) } - spec := testutil.NewSpecWithArgs("sleep", "1000") + spec, conf := sleepSpecConf(t) priv := filepath.Join(tmpDir, "priv") slave := filepath.Join(tmpDir, "slave") @@ -2241,7 +2252,6 @@ func TestMountPropagation(t *testing.T) { }, } - conf := testutil.TestConfig(t) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -2271,13 +2281,13 @@ func TestMountPropagation(t *testing.T) { // Check that mount didn't propagate to private mount. privFile := filepath.Join(priv, "mnt", "file") - if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 { + if ws, err := execute(conf, cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 { t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err) } // Check that mount propagated to slave mount. slaveFile := filepath.Join(slave, "mnt", "file") - if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 { + if ws, err := execute(conf, cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err) } } @@ -2343,7 +2353,7 @@ func TestMountSymlink(t *testing.T) { // Check that symlink was resolved and mount was created where the symlink // is pointing to. file := path.Join(target, "file") - if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 { + if ws, err := execute(conf, cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) } }) @@ -2556,12 +2566,11 @@ func TestRlimits(t *testing.T) { // TestRlimitsExec sets limit to number of open files and checks that the limit // is propagated to exec'd processes. func TestRlimitsExec(t *testing.T) { - spec := testutil.NewSpecWithArgs("sleep", "100") + spec, conf := sleepSpecConf(t) spec.Process.Rlimits = []specs.POSIXRlimit{ {Type: "RLIMIT_NOFILE", Hard: 1000, Soft: 100}, } - conf := testutil.TestConfig(t) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -2582,7 +2591,7 @@ func TestRlimitsExec(t *testing.T) { t.Fatalf("error starting container: %v", err) } - got, err := executeCombinedOutput(cont, "/bin/sh", "-c", "ulimit -n") + got, err := executeCombinedOutput(conf, cont, "/bin/sh", "-c", "ulimit -n") if err != nil { t.Fatal(err) } @@ -2590,3 +2599,276 @@ func TestRlimitsExec(t *testing.T) { t.Errorf("ulimit result, got: %q, want: %q", got, want) } } + +// TestCat creates a file and checks that cat generates the expected output. +func TestCat(t *testing.T) { + f, err := ioutil.TempFile(testutil.TmpDir(), "test-case") + if err != nil { + t.Fatalf("ioutil.TempFile failed: %v", err) + } + defer os.RemoveAll(f.Name()) + + content := "test-cat" + if _, err := f.WriteString(content); err != nil { + t.Fatalf("f.WriteString(): %v", err) + } + f.Close() + + spec, conf := sleepSpecConf(t) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("os.Create(): %v", err) + } + + if err := cont.Cat([]string{f.Name()}, w); err != nil { + t.Fatalf("error cat from container: %v", err) + } + + buf := make([]byte, 1024) + if _, err := r.Read(buf); err != nil { + t.Fatalf("Read out: %v", err) + } + if got, want := string(buf), content; !strings.Contains(got, want) { + t.Errorf("out got %s, want include %s", buf, want) + } +} + +// TestUsage checks that usage generates the expected memory usage. +func TestUsage(t *testing.T) { + spec, conf := sleepSpecConf(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + for _, full := range []bool{false, true} { + m, err := cont.Usage(full) + if err != nil { + t.Fatalf("error usage from container: %v", err) + } + if m.Mapped == 0 { + t.Errorf("Usage mapped got zero") + } + if m.Total == 0 { + t.Errorf("Usage total got zero") + } + if full { + if m.System == 0 { + t.Errorf("Usage system got zero") + } + if m.Anonymous == 0 { + t.Errorf("Usage anonymous got zero") + } + } + } +} + +// TestUsageFD checks that usagefd generates the expected memory usage. +func TestUsageFD(t *testing.T) { + spec, conf := sleepSpecConf(t) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + m, err := cont.UsageFD() + if err != nil { + t.Fatalf("error usageFD from container: %v", err) + } + + mapped, unknown, total, err := m.Fetch() + if err != nil { + t.Fatalf("error Fetch memory usage: %v", err) + } + + if mapped == 0 { + t.Errorf("UsageFD Mapped got zero") + } + if unknown == 0 { + t.Errorf("UsageFD unknown got zero") + } + if total == 0 { + t.Errorf("UsageFD total got zero") + } +} + +// TestReduce checks that reduce call succeeds. +func TestReduce(t *testing.T) { + spec, conf := sleepSpecConf(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + if err := cont.Reduce(false); err != nil { + t.Fatalf("error reduce from container: %v", err) + } +} + +// TestStream checks that Stream dumps expected events. +func TestStream(t *testing.T) { + spec, conf := sleepSpecConf(t) + conf.Strace = true + conf.StraceEvent = true + conf.StraceSyscalls = "" + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + + cont, err := New(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("os.Create(): %v", err) + } + + // Spawn a new thread to Stream events as it blocks indefinitely. + go func() { + cont.Stream(nil, w) + }() + + buf := make([]byte, 1024) + if _, err := r.Read(buf); err != nil { + t.Fatalf("Read out: %v", err) + } + + // A syscall strace event includes "Strace". + if got, want := string(buf), "Strace"; !strings.Contains(got, want) { + t.Errorf("out got %s, want include %s", buf, want) + } +} + +// TestProfile checks that profiling options generate profiles. +func TestProfile(t *testing.T) { + // Perform a non-trivial amount of work so we actually capture + // something in the profiles. + spec := testutil.NewSpecWithArgs("/bin/bash", "-c", "true") + conf := testutil.TestConfig(t) + conf.ProfileEnable = true + conf.ProfileBlock = filepath.Join(t.TempDir(), "block.pprof") + conf.ProfileCPU = filepath.Join(t.TempDir(), "cpu.pprof") + conf.ProfileHeap = filepath.Join(t.TempDir(), "heap.pprof") + conf.ProfileMutex = filepath.Join(t.TempDir(), "mutex.pprof") + conf.TraceFile = filepath.Join(t.TempDir(), "trace.out") + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + Attached: true, + } + + _, err = Run(conf, args) + if err != nil { + t.Fatalf("Creating container: %v", err) + } + + // Basic test; simply assert that the profiles are not empty. + for _, name := range []string{conf.ProfileBlock, conf.ProfileCPU, conf.ProfileHeap, conf.ProfileMutex, conf.TraceFile} { + fi, err := os.Stat(name) + if err != nil { + t.Fatalf("Unable to stat profile file %s: %v", name, err) + } + if fi.Size() == 0 { + t.Errorf("Profile file %s is empty: %+v", name, fi) + } + } +} diff --git a/runsc/container/hook.go b/runsc/container/hook.go index 901607aee..ce1c9e1de 100644 --- a/runsc/container/hook.go +++ b/runsc/container/hook.go @@ -101,8 +101,8 @@ func executeHook(h specs.Hook, s specs.State) error { return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) } case <-timer: - cmd.Process.Kill() - cmd.Wait() + _ = cmd.Process.Kill() + _ = cmd.Wait() return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 0dbe1e323..9d8022e50 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -105,11 +105,11 @@ type execDesc struct { name string } -func execMany(t *testing.T, execs []execDesc) { +func execMany(t *testing.T, conf *config.Config, execs []execDesc) { for _, exec := range execs { t.Run(exec.name, func(t *testing.T) { args := &control.ExecArgs{Argv: exec.cmd} - if ws, err := exec.c.executeSync(args); err != nil { + if ws, err := exec.c.executeSync(conf, args); err != nil { t.Errorf("error executing %+v: %v", args, err) } else if ws.ExitStatus() != exec.want { t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want) @@ -217,7 +217,7 @@ func TestMultiPIDNS(t *testing.T) { newProcessBuilder().PID(2).Cmd("sleep").Process(), newProcessBuilder().Cmd("ps").Process(), } - got, err := execPS(containers[0]) + got, err := execPS(conf, containers[0]) if err != nil { t.Fatal(err) } @@ -229,7 +229,7 @@ func TestMultiPIDNS(t *testing.T) { newProcessBuilder().PID(1).Cmd("sleep").Process(), newProcessBuilder().Cmd("ps").Process(), } - got, err = execPS(containers[1]) + got, err = execPS(conf, containers[1]) if err != nil { t.Fatal(err) } @@ -313,7 +313,7 @@ func TestMultiPIDNSPath(t *testing.T) { newProcessBuilder().PID(3).Cmd("sleep").Process(), newProcessBuilder().Cmd("ps").Process(), } - got, err := execPS(containers[0]) + got, err := execPS(conf, containers[0]) if err != nil { t.Fatal(err) } @@ -328,7 +328,7 @@ func TestMultiPIDNSPath(t *testing.T) { newProcessBuilder().PID(3).Cmd("sleep").Process(), newProcessBuilder().Cmd("ps").Process(), } - got, err = execPS(containers[1]) + got, err = execPS(conf, containers[1]) if err != nil { t.Fatal(err) } @@ -341,7 +341,7 @@ func TestMultiPIDNSPath(t *testing.T) { newProcessBuilder().PID(1).Cmd("sleep").Process(), newProcessBuilder().Cmd("ps").Process(), } - got, err = execPS(containers[2]) + got, err = execPS(conf, containers[2]) if err != nil { t.Fatal(err) } @@ -541,7 +541,7 @@ func TestExecWait(t *testing.T) { WorkingDirectory: "/", KUID: 0, } - pid, err := containers[0].Execute(args) + pid, err := containers[0].Execute(conf, args) if err != nil { t.Fatalf("error executing: %v", err) } @@ -744,7 +744,7 @@ func TestMultiContainerDestroy(t *testing.T) { Filename: app, Argv: []string{app, "fork-bomb"}, } - if _, err := containers[1].Execute(args); err != nil { + if _, err := containers[1].Execute(conf, args); err != nil { t.Fatalf("error exec'ing: %v", err) } @@ -821,7 +821,7 @@ func TestMultiContainerProcesses(t *testing.T) { Filename: "/bin/sleep", Argv: []string{"/bin/sleep", "100"}, } - if _, err := containers[1].Execute(args); err != nil { + if _, err := containers[1].Execute(conf, args); err != nil { t.Fatalf("error exec'ing: %v", err) } expectedPL1 = append(expectedPL1, newProcessBuilder().PID(4).Cmd("sleep").Process()) @@ -882,7 +882,7 @@ func TestMultiContainerKillAll(t *testing.T) { Filename: app, Argv: []string{app, "task-tree", "--depth=2", "--width=2"}, } - if _, err := containers[1].Execute(args); err != nil { + if _, err := containers[1].Execute(conf, args); err != nil { t.Fatalf("error exec'ing: %v", err) } // Wait for these new processes to start. @@ -894,7 +894,9 @@ func TestMultiContainerKillAll(t *testing.T) { if tc.killContainer { // First kill the init process to make the container be stopped with // processes still running inside. - containers[1].SignalContainer(unix.SIGKILL, false) + if err := containers[1].SignalContainer(unix.SIGKILL, false); err != nil { + t.Fatalf("SignalContainer(): %v", err) + } op := func() error { c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{}) if err != nil { @@ -912,7 +914,7 @@ func TestMultiContainerKillAll(t *testing.T) { c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{}) if err != nil { - t.Fatalf("failed to load child container %q: %v", c.ID, err) + t.Fatalf("failed to load child container %q: %v", ids[1], err) } // Kill'Em All if err := c.SignalContainer(unix.SIGKILL, true); err != nil { @@ -1040,7 +1042,8 @@ func TestMultiContainerDestroyStarting(t *testing.T) { wg.Add(1) go func() { defer wg.Done() - startCont.Start(conf) // ignore failures, start can fail if destroy runs first. + // Ignore failures, start can fail if destroy runs first. + _ = startCont.Start(conf) }() wg.Add(1) @@ -1314,7 +1317,7 @@ func TestMultiContainerSharedMount(t *testing.T) { name: "dir removed from container1", }, } - execMany(t, execs) + execMany(t, conf, execs) }) } } @@ -1379,7 +1382,7 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) { name: "fails to write to container1", }, } - execMany(t, execs) + execMany(t, conf, execs) }) } } @@ -1437,7 +1440,7 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { name: "file appears in container1", }, } - execMany(t, execs) + execMany(t, conf, execs) containers[1].Destroy() @@ -1487,7 +1490,7 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { name: "file removed from container1", }, } - execMany(t, execs) + execMany(t, conf, execs) }) } } @@ -1540,7 +1543,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { name: "directory is mounted in container1", }, } - execMany(t, execs) + execMany(t, conf, execs) }) } } @@ -1651,7 +1654,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { } // Check that container isn't running anymore. - if _, err := execute(c, "/bin/true"); err == nil { + if _, err := execute(conf, c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } @@ -1666,7 +1669,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { if err := waitForProcessList(c, pl); err != nil { t.Errorf("Container %q was affected by another container: %v", c.ID, err) } - if _, err := execute(c, "/bin/true"); err != nil { + if _, err := execute(conf, c, "/bin/true"); err != nil { t.Fatalf("Container %q was affected by another container: %v", c.ID, err) } } @@ -1688,7 +1691,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Check that entire sandbox isn't running anymore. for _, c := range containers { - if _, err := execute(c, "/bin/true"); err == nil { + if _, err := execute(conf, c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } } @@ -1864,7 +1867,7 @@ func TestMultiContainerHomeEnvDir(t *testing.T) { defer cleanup() // Exec into the root container synchronously. - if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil { + if _, err := execute(conf, containers[0], "/bin/sh", "-c", execCmd); err != nil { t.Errorf("error executing %+v: %v", execCmd, err) } @@ -1980,7 +1983,7 @@ func TestMultiContainerEvent(t *testing.T) { if busyUsage <= sleepUsage { t.Logf("Busy container usage lower than sleep (busy: %d, sleep: %d), retrying...", busyUsage, sleepUsage) - return fmt.Errorf("Busy container should have higher usage than sleep, busy: %d, sleep: %d", busyUsage, sleepUsage) + return fmt.Errorf("busy container should have higher usage than sleep, busy: %d, sleep: %d", busyUsage, sleepUsage) } return nil } @@ -2053,7 +2056,7 @@ func TestDuplicateEnvVariable(t *testing.T) { Argv: []string{"/bin/sh", "-c", cmdExec}, Envv: []string{"VAR=foo", "VAR=bar"}, } - if ws, err := containers[0].executeSync(execArgs); err != nil || ws.ExitStatus() != 0 { + if ws, err := containers[0].executeSync(conf, execArgs); err != nil || ws.ExitStatus() != 0 { t.Fatalf("exec failed, ws: %v, err: %v", ws, err) } diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go index cb5bffb89..f16b2bd02 100644 --- a/runsc/container/shared_volume_test.go +++ b/runsc/container/shared_volume_test.go @@ -72,7 +72,7 @@ func TestSharedVolume(t *testing.T) { Filename: "/usr/bin/test", Argv: []string{"test", "-f", filename}, } - if ws, err := c.executeSync(argsTestFile); err != nil { + if ws, err := c.executeSync(conf, argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) @@ -84,7 +84,7 @@ func TestSharedVolume(t *testing.T) { } // Now we should be able to test the file from within the sandbox. - if ws, err := c.executeSync(argsTestFile); err != nil { + if ws, err := c.executeSync(conf, argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -97,7 +97,7 @@ func TestSharedVolume(t *testing.T) { } // File should no longer exist at the old path within the sandbox. - if ws, err := c.executeSync(argsTestFile); err != nil { + if ws, err := c.executeSync(conf, argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) @@ -108,7 +108,7 @@ func TestSharedVolume(t *testing.T) { Filename: "/usr/bin/test", Argv: []string{"test", "-f", newFilename}, } - if ws, err := c.executeSync(argsTestNewFile); err != nil { + if ws, err := c.executeSync(conf, argsTestNewFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() != 0 { t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) @@ -120,7 +120,7 @@ func TestSharedVolume(t *testing.T) { } // Renamed file should no longer exist at the old path within the sandbox. - if ws, err := c.executeSync(argsTestNewFile); err != nil { + if ws, err := c.executeSync(conf, argsTestNewFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) @@ -133,7 +133,7 @@ func TestSharedVolume(t *testing.T) { KUID: auth.KUID(os.Getuid()), KGID: auth.KGID(os.Getgid()), } - if ws, err := c.executeSync(argsTouch); err != nil { + if ws, err := c.executeSync(conf, argsTouch); err != nil { t.Fatalf("unexpected error touching file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -154,7 +154,7 @@ func TestSharedVolume(t *testing.T) { Filename: "/bin/rm", Argv: []string{"rm", filename}, } - if ws, err := c.executeSync(argsRemove); err != nil { + if ws, err := c.executeSync(conf, argsRemove); err != nil { t.Fatalf("unexpected error removing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -166,14 +166,14 @@ func TestSharedVolume(t *testing.T) { } } -func checkFile(c *Container, filename string, want []byte) error { +func checkFile(conf *config.Config, c *Container, filename string, want []byte) error { cpy := filename + ".copy" - if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil { + if _, err := execute(conf, c, "/bin/cp", "-f", filename, cpy); err != nil { return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err) } got, err := ioutil.ReadFile(cpy) if err != nil { - return fmt.Errorf("Error reading file %q: %v", filename, err) + return fmt.Errorf("error reading file %q: %v", filename, err) } if !bytes.Equal(got, want) { return fmt.Errorf("file content inside the sandbox is wrong, got: %q, want: %q", got, want) @@ -226,16 +226,16 @@ func TestSharedVolumeFile(t *testing.T) { if err := ioutil.WriteFile(filename, []byte(want), 0666); err != nil { t.Fatalf("Error writing to %q: %v", filename, err) } - if err := checkFile(c, filename, want); err != nil { + if err := checkFile(conf, c, filename, want); err != nil { t.Fatal(err.Error()) } // Append to file inside the container and check that content is not lost. - if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil { + if _, err := execute(conf, c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil { t.Fatalf("unexpected error appending file %q: %v", filename, err) } want = []byte("host-sandbox-") - if err := checkFile(c, filename, want); err != nil { + if err := checkFile(conf, c, filename, want); err != nil { t.Fatal(err.Error()) } @@ -250,7 +250,7 @@ func TestSharedVolumeFile(t *testing.T) { t.Fatalf("Error writing to file %q: %v", filename, err) } want = []byte("host-sandbox-host") - if err := checkFile(c, filename, want); err != nil { + if err := checkFile(conf, c, filename, want); err != nil { t.Fatal(err.Error()) } @@ -259,7 +259,7 @@ func TestSharedVolumeFile(t *testing.T) { t.Fatalf("Error truncating file %q: %v", filename, err) } want = want[:5] - if err := checkFile(c, filename, want); err != nil { + if err := checkFile(conf, c, filename, want); err != nil { t.Fatal(err.Error()) } } diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go index 0399903a0..23810f593 100644 --- a/runsc/container/state_file.go +++ b/runsc/container/state_file.go @@ -264,10 +264,10 @@ func (s *StateFile) lockForNew() error { // Checks if the container already exists by looking for the metadata file. if _, err := os.Stat(s.statePath()); err == nil { - s.unlock() + s.unlockOrDie() return fmt.Errorf("container already exists") } else if !os.IsNotExist(err) { - s.unlock() + s.unlockOrDie() return fmt.Errorf("looking for existing container: %v", err) } return nil @@ -286,6 +286,15 @@ func (s *StateFile) unlock() error { return nil } +func (s *StateFile) unlockOrDie() { + if !s.flock.Locked() { + panic("unlock called without lock held") + } + if err := s.flock.Unlock(); err != nil { + panic(fmt.Sprintf("Error releasing lock on %q: %v", s.flock, err)) + } +} + // saveLocked saves 'v' to the state file. // // Preconditions: lock() must been called before. @@ -308,7 +317,7 @@ func (s *StateFile) load(v interface{}) error { if err := s.lock(); err != nil { return err } - defer s.unlock() + defer s.unlockOrDie() metaBytes, err := ioutil.ReadFile(s.statePath()) if err != nil { diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go index f921a8107..6b25da904 100644 --- a/runsc/flag/flag.go +++ b/runsc/flag/flag.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + // Package flag wraps flag primitives. package flag diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go index 2d0151dcc..1cb9d312a 100644 --- a/runsc/fsgofer/filter/config_amd64.go +++ b/runsc/fsgofer/filter/config_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package filter diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go index 7d458c02d..ab750c3be 100644 --- a/runsc/fsgofer/filter/config_arm64.go +++ b/runsc/fsgofer/filter/config_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package filter diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go index e28d4b8d6..5442add95 100644 --- a/runsc/fsgofer/filter/extra_filters.go +++ b/runsc/fsgofer/filter/extra_filters.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !msan && !race // +build !msan,!race package filter diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go index d768ed0bb..e5915652f 100644 --- a/runsc/fsgofer/filter/extra_filters_msan.go +++ b/runsc/fsgofer/filter/extra_filters_msan.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build msan // +build msan package filter diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go index 9e75c025d..1a4862e1b 100644 --- a/runsc/fsgofer/filter/extra_filters_race.go +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build race // +build race package filter diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 3f362b25e..600b21189 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -51,10 +51,10 @@ const ( // verityXattrs are the extended attributes used by verity file system. var verityXattrs = map[string]struct{}{ - "user.merkle.offset": struct{}{}, - "user.merkle.size": struct{}{}, - "user.merkle.childrenOffset": struct{}{}, - "user.merkle.childrenSize": struct{}{}, + "user.merkle.offset": {}, + "user.merkle.size": {}, + "user.merkle.childrenOffset": {}, + "user.merkle.childrenSize": {}, } // join is equivalent to path.Join() but skips path.Clean() which is expensive. @@ -1242,13 +1242,14 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) { } parent := l.file.FD() - for _, name := range names { - child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0) + closeParent := func() { if parent != l.file.FD() { - // Parent is no longer needed. _ = unix.Close(parent) - parent = -1 } + } + defer closeParent() + for _, name := range names { + child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0) if err != nil { if errors.Is(err, unix.ENOENT) { // No pont in continuing any further. @@ -1256,10 +1257,11 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) { } return nil, err } + closeParent() + parent = child var stat unix.Stat_t if err := unix.Fstat(child, &stat); err != nil { - _ = unix.Close(child) return nil, err } valid, attr := l.fillAttr(&stat) @@ -1271,13 +1273,9 @@ func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) { if (stat.Mode & unix.S_IFMT) != unix.S_IFDIR { // Doesn't need to continue if entry is not a dir. Including symlinks // that cannot be followed. - _ = unix.Close(child) break } parent = child } - if parent != -1 && parent != l.file.FD() { - _ = unix.Close(parent) - } return stats, nil } diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go index 29ebf8500..884f7fc26 100644 --- a/runsc/fsgofer/fsgofer_amd64_unsafe.go +++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package fsgofer diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go index 9fd5d0871..1207d9e8a 100644 --- a/runsc/fsgofer/fsgofer_arm64_unsafe.go +++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package fsgofer diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 77723827a..ee6cc97df 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -65,15 +65,6 @@ func configTestName(conf *Config) string { return "RWMount" } -func assertPanic(t *testing.T, f func()) { - defer func() { - if r := recover(); r == nil { - t.Errorf("function did not panic") - } - }() - f() -} - func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { want := make([]byte, len(content)) copy(want, content) @@ -195,7 +186,7 @@ func setup(fileType uint32) (string, string, error) { } root, err := a.Attach() if err != nil { - return "", "", fmt.Errorf("Attach failed, err: %v", err) + return "", "", fmt.Errorf("attach failed, err: %v", err) } defer root.Close() @@ -290,10 +281,10 @@ func checkIDs(f p9.File, uid, gid int) error { return fmt.Errorf("GetAttr() failed, err: %v", err) } if want := p9.UID(uid); stat.UID != want { - return fmt.Errorf("Wrong UID, want: %v, got: %v", want, stat.UID) + return fmt.Errorf("wrong UID, want: %v, got: %v", want, stat.UID) } if want := p9.GID(gid); stat.GID != want { - return fmt.Errorf("Wrong GID, want: %v, got: %v", want, stat.GID) + return fmt.Errorf("wrong GID, want: %v, got: %v", want, stat.GID) } return nil } @@ -574,7 +565,7 @@ func SetGetXattr(l *localFile, name string, value string) error { return err } if ret != value { - return fmt.Errorf("Got value %s, want %s", ret, value) + return fmt.Errorf("got value %s, want %s", ret, value) } return nil } diff --git a/runsc/mitigate/BUILD b/runsc/mitigate/BUILD index 1238890fc..9f4a7ba8d 100644 --- a/runsc/mitigate/BUILD +++ b/runsc/mitigate/BUILD @@ -4,7 +4,10 @@ package(licenses = ["notice"]) go_library( name = "mitigate", - srcs = ["mitigate.go"], + srcs = [ + "mitigate.go", + "mock.go", + ], visibility = [ "//runsc:__subpackages__", ], @@ -16,8 +19,5 @@ go_test( size = "small", srcs = ["mitigate_test.go"], library = ":mitigate", - deps = [ - "//runsc/mitigate/mock", - "@com_github_google_go_cmp//cmp:go_default_library", - ], + deps = ["@com_github_google_go_cmp//cmp:go_default_library"], ) diff --git a/runsc/mitigate/mitigate.go b/runsc/mitigate/mitigate.go index 88409af8f..00e5bf2a9 100644 --- a/runsc/mitigate/mitigate.go +++ b/runsc/mitigate/mitigate.go @@ -19,10 +19,7 @@ package mitigate import ( "fmt" - "io/ioutil" - "os" "regexp" - "sort" "strconv" "strings" ) @@ -39,128 +36,20 @@ const ( physicalIDKey = "physical id" coreIDKey = "core id" bugsKey = "bugs" - - // Path to shutdown a CPU. - cpuOnlineTemplate = "/sys/devices/system/cpu/cpu%d/online" ) // CPUSet contains a map of all CPUs on the system, mapped // by Physical ID and CoreIDs. threads with the same // Core and Physical ID are Hyperthread pairs. -type CPUSet map[threadID]*ThreadGroup +type CPUSet []*CPU // NewCPUSet creates a CPUSet from data read from /proc/cpuinfo. -func NewCPUSet(data []byte) (CPUSet, error) { - processors, err := getThreads(string(data)) - if err != nil { - return nil, err - } - - set := make(CPUSet) - for _, p := range processors { - // Each ID is of the form physicalID:coreID. Hyperthread pairs - // have identical physical and core IDs. We need to match - // Hyperthread pairs so that we can shutdown all but one per - // pair. - core, ok := set[p.id] - if !ok { - core = &ThreadGroup{} - set[p.id] = core - } - core.isVulnerable = core.isVulnerable || p.IsVulnerable() - core.threads = append(core.threads, p) - } - - // We need to make sure we shutdown the lowest number processor per - // thread group. - for _, tg := range set { - sort.Slice(tg.threads, func(i, j int) bool { - return tg.threads[i].processorNumber < tg.threads[j].processorNumber - }) - } - return set, nil -} - -// NewCPUSetFromPossible makes a cpuSet data read from -// /sys/devices/system/cpu/possible. This is used in enable operations -// where the caller simply wants to enable all CPUS. -func NewCPUSetFromPossible(data []byte) (CPUSet, error) { - threads, err := GetThreadsFromPossible(data) - if err != nil { - return nil, err - } - - // We don't care if a CPU is vulnerable or not, we just - // want to return a list of all CPUs on the host. - set := CPUSet{ - threads[0].id: &ThreadGroup{ - threads: threads, - isVulnerable: false, - }, - } - return set, nil -} - -// String implements the String method for CPUSet. -func (c CPUSet) String() string { - ret := "" - for _, tg := range c { - ret += fmt.Sprintf("%s\n", tg) - } - return ret -} - -// GetRemainingList returns the list of threads that will remain active -// after mitigation. -func (c CPUSet) GetRemainingList() []Thread { - threads := make([]Thread, 0, len(c)) - for _, core := range c { - // If we're vulnerable, take only one thread from the pair. - if core.isVulnerable { - threads = append(threads, core.threads[0]) - continue - } - // Otherwise don't shutdown anything. - threads = append(threads, core.threads...) - } - return threads -} - -// GetShutdownList returns the list of threads that will be shutdown on -// mitigation. -func (c CPUSet) GetShutdownList() []Thread { - threads := make([]Thread, 0) - for _, core := range c { - // Only if we're vulnerable do shutdown anything. In this case, - // shutdown all but the first entry. - if core.isVulnerable && len(core.threads) > 1 { - threads = append(threads, core.threads[1:]...) - } - } - return threads -} - -// ThreadGroup represents Hyperthread pairs on the same physical/core ID. -type ThreadGroup struct { - threads []Thread - isVulnerable bool -} - -// String implements the String method for threadGroup. -func (c ThreadGroup) String() string { - ret := fmt.Sprintf("ThreadGroup:\nIsVulnerable: %t\n", c.isVulnerable) - for _, processor := range c.threads { - ret += fmt.Sprintf("%s\n", processor) - } - return ret -} - -// getThreads returns threads structs from reading /proc/cpuinfo. -func getThreads(data string) ([]Thread, error) { +func NewCPUSet(data string) (CPUSet, error) { // Each processor entry should start with the // processor key. Find the beginings of each. - r := buildRegex(processorKey, `\d+`) + r := buildRegex(processorKey) indices := r.FindAllStringIndex(data, -1) + if len(indices) < 1 { return nil, fmt.Errorf("no cpus found for: %q", data) } @@ -172,193 +61,132 @@ func getThreads(data string) ([]Thread, error) { // indexes (e.g. data[index[i], index[i+1]]). // There should be len(indicies) - 1 CPUs // since the last index is the end of the string. - cpus := make([]Thread, 0, len(indices)) + var set CPUSet // Find each string that represents a CPU. These begin "processor". for i := 1; i < len(indices); i++ { start := indices[i-1][0] end := indices[i][0] // Parse the CPU entry, which should be between start/end. - c, err := newThread(data[start:end]) + c, err := newCPU(data[start:end]) if err != nil { return nil, err } - cpus = append(cpus, c) + set = append(set, c) } - return cpus, nil + return set, nil } -// GetThreadsFromPossible makes threads from data read from /sys/devices/system/cpu/possible. -func GetThreadsFromPossible(data []byte) ([]Thread, error) { - possibleRegex := regexp.MustCompile(`(?m)^(\d+)(-(\d+))?$`) - matches := possibleRegex.FindStringSubmatch(string(data)) - if len(matches) != 4 { - return nil, fmt.Errorf("mismatch regex from possible: %q", string(data)) - } - - // If matches[3] is empty, we only have one cpu entry. - if matches[3] == "" { - matches[3] = matches[1] - } - - begin, err := strconv.ParseInt(matches[1], 10, 64) - if err != nil { - return nil, fmt.Errorf("failed to parse begin: %v", err) - } - end, err := strconv.ParseInt(matches[3], 10, 64) - if err != nil { - return nil, fmt.Errorf("failed to parse end: %v", err) - } - if begin > end || begin < 0 || end < 0 { - return nil, fmt.Errorf("invalid cpu bounds from possible: begin: %d end: %d", begin, end) - } - - ret := make([]Thread, 0, end-begin) - for i := begin; i <= end; i++ { - ret = append(ret, Thread{ - processorNumber: i, - id: threadID{ - physicalID: 0, // we don't care about id for enable ops. - coreID: 0, - }, - }) +// IsVulnerable checks if this CPUSet is vulnerable to MDS. +func (c CPUSet) IsVulnerable() bool { + for _, cpu := range c { + if cpu.IsVulnerable() { + return true + } } - - return ret, nil + return false } -// threadID for each thread is defined by the physical and -// core IDs. If equal, two threads are Hyperthread pairs. -type threadID struct { - physicalID int64 - coreID int64 +// String implements the String method for CPUSet. +func (c CPUSet) String() string { + parts := make([]string, len(c)) + for i, cpu := range c { + parts[i] = cpu.String() + } + return strings.Join(parts, "\n") } -// Thread represents pertinent info about a single hyperthread in a pair. -type Thread struct { +// CPU represents pertinent info about a single hyperthread in a pair. +type CPU struct { processorNumber int64 // the processor number of this CPU. vendorID string // the vendorID of CPU (e.g. AuthenticAMD). cpuFamily int64 // CPU family number (e.g. 6 for CascadeLake/Skylake). model int64 // CPU model number (e.g. 85 for CascadeLake/Skylake). - id threadID // id for this thread + physicalID int64 // Physical ID of this CPU. + coreID int64 // Core ID of this CPU. bugs map[string]struct{} // map of vulnerabilities parsed from the 'bugs' field. } -// newThread parses a CPU from a single cpu entry from /proc/cpuinfo. -func newThread(data string) (Thread, error) { - empty := Thread{} +func newCPU(data string) (*CPU, error) { processor, err := parseProcessor(data) if err != nil { - return empty, err + return nil, err } vendorID, err := parseVendorID(data) if err != nil { - return empty, err + return nil, err } cpuFamily, err := parseCPUFamily(data) if err != nil { - return empty, err + return nil, err } model, err := parseModel(data) if err != nil { - return empty, err + return nil, err } physicalID, err := parsePhysicalID(data) if err != nil { - return empty, err + return nil, err } coreID, err := parseCoreID(data) if err != nil { - return empty, err + return nil, err } bugs, err := parseBugs(data) if err != nil { - return empty, err + return nil, err } - return Thread{ + return &CPU{ processorNumber: processor, vendorID: vendorID, cpuFamily: cpuFamily, model: model, - id: threadID{ - physicalID: physicalID, - coreID: coreID, - }, - bugs: bugs, + physicalID: physicalID, + coreID: coreID, + bugs: bugs, }, nil } -// String implements the String method for thread. -func (t Thread) String() string { - template := `CPU: %d -CPU ID: %+v -Vendor: %s -Family/Model: %d/%d -Bugs: %s +// String implements the String method for CPU. +func (t *CPU) String() string { + template := `%s: %d +%s: %s +%s: %d +%s: %d +%s: %d +%s: %d +%s: %s ` - bugs := make([]string, 0) + var bugs []string for bug := range t.bugs { bugs = append(bugs, bug) } - return fmt.Sprintf(template, t.processorNumber, t.id, t.vendorID, t.cpuFamily, t.model, strings.Join(bugs, ",")) -} - -// Enable turns on the CPU by writing 1 to /sys/devices/cpu/cpu{N}/online. -func (t Thread) Enable() error { - // Linux ensures that "cpu0" is always online. - if t.processorNumber == 0 { - return nil - } - cpuPath := fmt.Sprintf(cpuOnlineTemplate, t.processorNumber) - f, err := os.OpenFile(cpuPath, os.O_WRONLY|os.O_CREATE, 0644) - if err != nil { - return fmt.Errorf("failed to open file %s: %v", cpuPath, err) - } - if _, err = f.Write([]byte{'1'}); err != nil { - return fmt.Errorf("failed to write '1' to %s: %v", cpuPath, err) - } - return nil -} - -// Disable turns off the CPU by writing 0 to /sys/devices/cpu/cpu{N}/online. -func (t Thread) Disable() error { - // The core labeled "cpu0" can never be taken offline via this method. - // Linux will return EPERM if the user even creates a file at the /sys - // path above. - if t.processorNumber == 0 { - return fmt.Errorf("invalid shutdown operation: cpu0 cannot be disabled") - } - cpuPath := fmt.Sprintf(cpuOnlineTemplate, t.processorNumber) - return ioutil.WriteFile(cpuPath, []byte{'0'}, 0644) + return fmt.Sprintf(template, + processorKey, t.processorNumber, + vendorIDKey, t.vendorID, + cpuFamilyKey, t.cpuFamily, + modelKey, t.model, + physicalIDKey, t.physicalID, + coreIDKey, t.coreID, + bugsKey, strings.Join(bugs, " ")) } // IsVulnerable checks if a CPU is vulnerable to mds. -func (t Thread) IsVulnerable() bool { +func (t *CPU) IsVulnerable() bool { _, ok := t.bugs[mds] return ok } -// isActive checks if a CPU is active from /sys/devices/system/cpu/cpu{N}/online -// If the file does not exist (ioutil returns in error), we assume the CPU is on. -func (t Thread) isActive() bool { - cpuPath := fmt.Sprintf(cpuOnlineTemplate, t.processorNumber) - data, err := ioutil.ReadFile(cpuPath) - if err != nil { - return true - } - return len(data) > 0 && data[0] != '0' -} - // SimilarTo checks family/model/bugs fields for equality of two // processors. -func (t Thread) SimilarTo(other Thread) bool { +func (t *CPU) SimilarTo(other *CPU) bool { if t.vendorID != other.vendorID { return false } @@ -437,14 +265,14 @@ func parseIntegerResult(data, key string) (int64, error) { } // buildRegex builds a regex for parsing each CPU field. -func buildRegex(key, match string) *regexp.Regexp { +func buildRegex(key string) *regexp.Regexp { reg := fmt.Sprintf(`(?m)^%s\s*:\s*(.*)$`, key) return regexp.MustCompile(reg) } // parseRegex parses data with key inserted into a standard regex template. func parseRegex(data, key, match string) (string, error) { - r := buildRegex(key, match) + r := buildRegex(key) matches := r.FindStringSubmatch(data) if len(matches) < 2 { diff --git a/runsc/mitigate/mitigate_test.go b/runsc/mitigate/mitigate_test.go index 890c65f05..e79d879e9 100644 --- a/runsc/mitigate/mitigate_test.go +++ b/runsc/mitigate/mitigate_test.go @@ -12,95 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package mitigate import ( - "fmt" "io/ioutil" "strings" "testing" - - "gvisor.dev/gvisor/runsc/mitigate/mock" ) // TestMockCPUSet tests mock cpu test cases against the cpuSet functions. func TestMockCPUSet(t *testing.T) { for _, tc := range []struct { - testCase mock.CPU + testCase MockCPU isVulnerable bool }{ { - testCase: mock.AMD8, + testCase: AMD8, isVulnerable: false, }, { - testCase: mock.Haswell2, + testCase: Haswell2, isVulnerable: true, }, { - testCase: mock.Haswell2core, + testCase: Haswell2core, isVulnerable: true, }, { - testCase: mock.CascadeLake2, + testCase: CascadeLake2, isVulnerable: true, }, { - testCase: mock.CascadeLake4, + testCase: CascadeLake4, isVulnerable: true, }, } { t.Run(tc.testCase.Name, func(t *testing.T) { - data := tc.testCase.MakeCPUString() - set, err := NewCPUSet([]byte(data)) + data := tc.testCase.MakeCPUSet().String() + set, err := NewCPUSet(data) if err != nil { t.Fatalf("Failed to create cpuSet: %v", err) } - t.Logf("data: %s", data) - - for _, tg := range set { - if err := checkSorted(tg.threads); err != nil { - t.Fatalf("Failed to sort cpuSet: %v", err) - } - } - - remaining := set.GetRemainingList() - // In the non-vulnerable case, no cores should be shutdown so all should remain. - want := tc.testCase.PhysicalCores * tc.testCase.Cores * tc.testCase.ThreadsPerCore - if tc.isVulnerable { - want = tc.testCase.PhysicalCores * tc.testCase.Cores - } - - if want != len(remaining) { - t.Fatalf("Failed to shutdown the correct number of cores: want: %d got: %d", want, len(remaining)) - } - - if !tc.isVulnerable { - return + if tc.testCase.NumCPUs() != len(set) { + t.Fatalf("Got wrong number of CPUs: want: %d got: %d", tc.testCase.NumCPUs(), len(set)) } - // If the set is vulnerable, we expect only 1 thread per hyperthread pair. - for _, r := range remaining { - if _, ok := set[r.id]; !ok { - t.Fatalf("Entry %+v not in map, there must be two entries in the same thread group.", r) - } - delete(set, r.id) - } - - possible := tc.testCase.MakeSysPossibleString() - set, err = NewCPUSetFromPossible([]byte(possible)) - if err != nil { - t.Fatalf("Failed to make cpuSet: %v", err) - } - - want = tc.testCase.PhysicalCores * tc.testCase.Cores * tc.testCase.ThreadsPerCore - got := len(set.GetRemainingList()) - if got != want { - t.Fatalf("Returned the wrong number of CPUs want: %d got: %d", want, got) + if set.IsVulnerable() != tc.isVulnerable { + t.Fatalf("incorrect vulnerable value: got: %t want: %t", set.IsVulnerable(), tc.isVulnerable) } + t.Logf("data: %s", data) }) } } @@ -116,29 +80,27 @@ physical id: 0 core id : 0 bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa itlb_multihit ` - want := Thread{ + want := CPU{ processorNumber: 0, vendorID: "GenuineIntel", cpuFamily: 6, model: 85, - id: threadID{ - physicalID: 0, - coreID: 0, - }, + physicalID: 0, + coreID: 0, bugs: map[string]struct{}{ - "cpu_meltdown": struct{}{}, - "spectre_v1": struct{}{}, - "spectre_v2": struct{}{}, - "spec_store_bypass": struct{}{}, - "l1tf": struct{}{}, - "mds": struct{}{}, - "swapgs": struct{}{}, - "taa": struct{}{}, - "itlb_multihit": struct{}{}, + "cpu_meltdown": {}, + "spectre_v1": {}, + "spectre_v2": {}, + "spec_store_bypass": {}, + "l1tf": {}, + "mds": {}, + "swapgs": {}, + "taa": {}, + "itlb_multihit": {}, }, } - got, err := newThread(data) + got, err := newCPU(data) if err != nil { t.Fatalf("getCpu failed with error: %v", err) } @@ -153,12 +115,12 @@ bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa } func TestInvalid(t *testing.T) { - result, err := getThreads(`something not a processor`) + result, err := newCPU(`something not a processor`) if err == nil { t.Fatalf("getCPU set didn't return an error: %+v", result) } - if !strings.Contains(err.Error(), "no cpus") { + if !strings.Contains(err.Error(), "failed to match key \"processor\"") { t.Fatalf("Incorrect error returned: %v", err) } } @@ -220,7 +182,7 @@ cache_alignment : 64 address sizes : 46 bits physical, 48 bits virtual power management: ` - cpuSet, err := getThreads(data) + cpuSet, err := NewCPUSet(data) if err != nil { t.Fatalf("getCPUSet failed: %v", err) } @@ -230,18 +192,18 @@ power management: t.Fatalf("Num CPU mismatch: want: %d, got: %d", wantCPULen, len(cpuSet)) } - wantCPU := Thread{ + wantCPU := CPU{ vendorID: "GenuineIntel", cpuFamily: 6, model: 63, bugs: map[string]struct{}{ - "cpu_meltdown": struct{}{}, - "spectre_v1": struct{}{}, - "spectre_v2": struct{}{}, - "spec_store_bypass": struct{}{}, - "l1tf": struct{}{}, - "mds": struct{}{}, - "swapgs": struct{}{}, + "cpu_meltdown": {}, + "spectre_v1": {}, + "spectre_v2": {}, + "spec_store_bypass": {}, + "l1tf": {}, + "mds": {}, + "swapgs": {}, }, } @@ -259,17 +221,11 @@ func TestReadFile(t *testing.T) { t.Fatalf("Failed to read cpuinfo: %v", err) } - set, err := NewCPUSet(data) + set, err := NewCPUSet(string(data)) if err != nil { t.Fatalf("Failed to parse CPU data %v\n%s", err, data) } - for _, tg := range set { - if err := checkSorted(tg.threads); err != nil { - t.Fatalf("Failed to sort cpuSet: %v", err) - } - } - if len(set) < 1 { t.Fatalf("Failed to parse any CPUs: %d", len(set)) } @@ -334,38 +290,6 @@ cache_alignment : 64 address sizes : 46 bits physical, 48 bits virtual power management:` - const cascade = `processor : 0 -vendor_id : GenuineIntel -cpu family : 6 -model : 85 -model name : Intel(R) Xeon(R) CPU -stepping : 7 -microcode : 0x1 -cpu MHz : 2800.198 -cache size : 33792 KB -physical id : 0 -siblings : 2 -core id : 0 -cpu cores : 1 -apicid : 0 -initial apicid : 0 -fpu : yes -fpu_exception : yes -cpuid level : 13 -wp : yes -flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 - ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmu -lqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowpr -efetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid r -tm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves a -rat avx512_vnni md_clear arch_capabilities -bugs : spectre_v1 spectre_v2 spec_store_bypass mds swapgs taa -bogomips : 5600.39 -clflush size : 64 -cache_alignment : 64 -address sizes : 46 bits physical, 48 bits virtual -power management:` - const amd = `processor : 0 vendor_id : AuthenticAMD cpu family : 23 @@ -414,7 +338,7 @@ power management:` }, } { t.Run(tc.name, func(t *testing.T) { - set, err := getThreads(tc.cpuString) + set, err := NewCPUSet(tc.cpuString) if err != nil { t.Fatalf("Failed to getCPUSet:%v\n %s", err, tc.cpuString) } @@ -429,104 +353,9 @@ power management:` }() if got != tc.vulnerable { - t.Fatalf("Mismatch vulnerable for cpu %+s: got %t want: %t", tc.name, tc.vulnerable, got) + t.Fatalf("Mismatch vulnerable for cpu %s: got %t want: %t", tc.name, tc.vulnerable, got) } } }) } } - -func TestReverse(t *testing.T) { - const noParse = "-1-" - for _, tc := range []struct { - name string - output string - wantErr error - wantCount int - }{ - { - name: "base", - output: "0-7", - wantErr: nil, - wantCount: 8, - }, - { - name: "huge", - output: "0-111", - wantErr: nil, - wantCount: 112, - }, - { - name: "not zero", - output: "50-53", - wantErr: nil, - wantCount: 4, - }, - { - name: "small", - output: "0", - wantErr: nil, - wantCount: 1, - }, - { - name: "invalid order", - output: "10-6", - wantErr: fmt.Errorf("invalid cpu bounds from possible: begin: %d end: %d", 10, 6), - }, - { - name: "no parse", - output: noParse, - wantErr: fmt.Errorf(`mismatch regex from possible: %q`, noParse), - }, - } { - t.Run(tc.name, func(t *testing.T) { - threads, err := GetThreadsFromPossible([]byte(tc.output)) - - switch { - case tc.wantErr == nil: - if err != nil { - t.Fatalf("Wanted nil err, got: %v", err) - } - case err == nil: - t.Fatalf("Want error: %v got: %v", tc.wantErr, err) - default: - if tc.wantErr.Error() != err.Error() { - t.Fatalf("Want error: %v got error: %v", tc.wantErr, err) - } - } - - if len(threads) != tc.wantCount { - t.Fatalf("Want count: %d got: %d", tc.wantCount, len(threads)) - } - }) - } -} - -func TestReverseSmoke(t *testing.T) { - data, err := ioutil.ReadFile("/sys/devices/system/cpu/possible") - if err != nil { - t.Fatalf("Failed to read from possible: %v", err) - } - threads, err := GetThreadsFromPossible(data) - if err != nil { - t.Fatalf("Could not parse possible output: %v", err) - } - - if len(threads) <= 0 { - t.Fatalf("Didn't get any CPU cores: %d", len(threads)) - } -} - -func checkSorted(threads []Thread) error { - if len(threads) < 2 { - return nil - } - last := threads[0].processorNumber - for _, t := range threads[1:] { - if last >= t.processorNumber { - return fmt.Errorf("threads out of order: thread %d before %d", t.processorNumber, last) - } - last = t.processorNumber - } - return nil -} diff --git a/runsc/mitigate/mock/mock.go b/runsc/mitigate/mock.go index 12c59e356..4588ae2ed 100644 --- a/runsc/mitigate/mock/mock.go +++ b/runsc/mitigate/mock.go @@ -12,26 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package mock contains mock CPUs for mitigate tests. -package mock +package mitigate -import "fmt" +import "strings" -// CPU represents data from CPUs that will be mitigated. -type CPU struct { +// MockCPU represents data from CPUs that will be mitigated. +type MockCPU struct { Name string VendorID string - Family int - Model int + Family int64 + Model int64 ModelName string Bugs string - PhysicalCores int - Cores int - ThreadsPerCore int + PhysicalCores int64 + Cores int64 + ThreadsPerCore int64 } // CascadeLake2 is a two core Intel CascadeLake machine. -var CascadeLake2 = CPU{ +var CascadeLake2 = MockCPU{ Name: "CascadeLake", VendorID: "GenuineIntel", Family: 6, @@ -44,7 +43,7 @@ var CascadeLake2 = CPU{ } // CascadeLake4 is a four core Intel CascadeLake machine. -var CascadeLake4 = CPU{ +var CascadeLake4 = MockCPU{ Name: "CascadeLake", VendorID: "GenuineIntel", Family: 6, @@ -57,7 +56,7 @@ var CascadeLake4 = CPU{ } // Haswell2 is a two core Intel Haswell machine. -var Haswell2 = CPU{ +var Haswell2 = MockCPU{ Name: "Haswell", VendorID: "GenuineIntel", Family: 6, @@ -70,7 +69,7 @@ var Haswell2 = CPU{ } // Haswell2core is a 2 core Intel Haswell machine with no hyperthread pairs. -var Haswell2core = CPU{ +var Haswell2core = MockCPU{ Name: "Haswell2Physical", VendorID: "GenuineIntel", Family: 6, @@ -83,7 +82,7 @@ var Haswell2core = CPU{ } // AMD2 is an two core AMD machine. -var AMD2 = CPU{ +var AMD2 = MockCPU{ Name: "AMD", VendorID: "AuthenticAMD", Family: 23, @@ -96,7 +95,7 @@ var AMD2 = CPU{ } // AMD8 is an eight core AMD machine. -var AMD8 = CPU{ +var AMD8 = MockCPU{ Name: "AMD", VendorID: "AuthenticAMD", Family: 23, @@ -108,47 +107,39 @@ var AMD8 = CPU{ ThreadsPerCore: 2, } -// MakeCPUString makes a string formated like /proc/cpuinfo for each cpuTestCase -func (tc CPU) MakeCPUString() string { - template := `processor : %d -vendor_id : %s -cpu family : %d -model : %d -model name : %s -physical id : %d -core id : %d -cpu cores : %d -bugs : %s - -` +// Empty is an empty CPU set. +var Empty = MockCPU{ + Name: "Empty", +} - ret := `` - for i := 0; i < tc.PhysicalCores; i++ { - for j := 0; j < tc.Cores; j++ { - for k := 0; k < tc.ThreadsPerCore; k++ { +// MakeCPUSet makes a cpuSet from a MockCPU. +func (tc MockCPU) MakeCPUSet() CPUSet { + bugs := make(map[string]struct{}) + for _, bug := range strings.Split(tc.Bugs, " ") { + bugs[bug] = struct{}{} + } + var cpus CPUSet = []*CPU{} + for i := int64(0); i < tc.PhysicalCores; i++ { + for j := int64(0); j < tc.Cores; j++ { + for k := int64(0); k < tc.ThreadsPerCore; k++ { processorNum := (i*tc.Cores+j)*tc.ThreadsPerCore + k - ret += fmt.Sprintf(template, - processorNum, /*processor*/ - tc.VendorID, /*vendor_id*/ - tc.Family, /*cpu family*/ - tc.Model, /*model*/ - tc.ModelName, /*model name*/ - i, /*physical id*/ - j, /*core id*/ - k, /*cpu cores*/ - tc.Bugs, /*bugs*/ - ) + cpu := &CPU{ + processorNumber: processorNum, + vendorID: tc.VendorID, + cpuFamily: tc.Family, + model: tc.Model, + physicalID: i, + coreID: j, + bugs: bugs, + } + cpus = append(cpus, cpu) } } } - return ret + return cpus } -// MakeSysPossibleString makes a string representing a the contents of /sys/devices/system/cpu/possible. -func (tc CPU) MakeSysPossibleString() string { - max := tc.PhysicalCores * tc.Cores * tc.ThreadsPerCore - if max == 1 { - return "0" - } - return fmt.Sprintf("0-%d", max-1) +// NumCPUs returns the number of CPUs for this CPU. +func (tc MockCPU) NumCPUs() int { + return int(tc.PhysicalCores * tc.Cores * tc.ThreadsPerCore) } diff --git a/runsc/mitigate/mock/BUILD b/runsc/mitigate/mock/BUILD deleted file mode 100644 index 5019ff9ee..000000000 --- a/runsc/mitigate/mock/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "mock", - srcs = ["mock.go"], - visibility = [ - "//runsc:__subpackages__", - ], -) diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index bc4a3fa32..d625230dd 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -17,12 +17,14 @@ go_library( "//pkg/control/client", "//pkg/control/server", "//pkg/coverage", + "//pkg/eventchannel", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", "//pkg/sync", "//pkg/tcpip/header", "//pkg/tcpip/stack", + "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/boot/platforms", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index f69558021..3451d1037 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -49,7 +49,7 @@ import ( // // Run the following container to test it: // docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 -func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error { +func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error { log.Infof("Setting up network") switch conf.Network { @@ -301,13 +301,13 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) ( // Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer // for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max". - // wmem_max/rmem_max default to a unusually low value of 208KB. This is too low - // for gVisor to be able to receive packets at high throughputs without + // wmem_max/rmem_max default to a unusually low value of 208KB. This is too + // low for gVisor to be able to receive packets at high throughputs without // incurring packet drops. const bufSize = 4 << 20 // 4MB. if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil { - unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize) + _ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize) sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF) if sz < bufSize { @@ -316,10 +316,10 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) ( } if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil { - unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize) + _ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize) sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF) if sz < bufSize { - log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err) + log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err) } } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 29e202b7d..f4a37cedc 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -35,10 +35,12 @@ import ( "gvisor.dev/gvisor/pkg/control/client" "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" @@ -65,6 +67,11 @@ type Sandbox struct { // is not running. Pid int `json:"pid"` + // UID is the user ID in the parent namespace that the sandbox is running as. + UID int `json:"uid"` + // GID is the group ID in the parent namespace that the sandbox is running as. + GID int `json:"gid"` + // Cgroup has the cgroup configuration for the sandbox. Cgroup *cgroup.Cgroup `json:"cgroup"` @@ -175,26 +182,30 @@ func New(conf *config.Config, args *Args) (*Sandbox, error) { return s, nil } -// CreateContainer creates a non-root container inside the sandbox. -func (s *Sandbox) CreateContainer(cid string, tty *os.File) error { - log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) - sandboxConn, err := s.sandboxConnect() - if err != nil { - return fmt.Errorf("couldn't connect to sandbox: %v", err) - } - defer sandboxConn.Close() +// CreateSubcontainer creates a container inside the sandbox. +func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error { + log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) var files []*os.File if tty != nil { files = []*os.File{tty} } + if err := s.configureStdios(conf, files); err != nil { + return err + } + + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() args := boot.CreateArgs{ CID: cid, FilePayload: urpc.FilePayload{Files: files}, } - if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil { - return fmt.Errorf("creating non-root container %q: %v", cid, err) + if err := sandboxConn.Call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil { + return fmt.Errorf("creating sub-container %q: %v", cid, err) } return nil } @@ -209,22 +220,27 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error { defer conn.Close() // Configure the network. - if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { + if err := setupNetwork(conn, s.Pid, conf); err != nil { return fmt.Errorf("setting up network: %v", err) } // Send a message to the sandbox control server to start the root // container. - if err := conn.Call(boot.RootContainerStart, &s.ID, nil); err != nil { + if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil { return fmt.Errorf("starting root container: %v", err) } return nil } -// StartContainer starts running a non-root container inside the sandbox. -func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error { - log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) +// StartSubcontainer starts running a sub-container inside the sandbox. +func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error { + log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) + + if err := s.configureStdios(conf, stdios); err != nil { + return err + } + sandboxConn, err := s.sandboxConnect() if err != nil { return fmt.Errorf("couldn't connect to sandbox: %v", err) @@ -244,8 +260,8 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid stri CID: cid, FilePayload: payload, } - if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { - return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err) + if err := sandboxConn.Call(boot.ContMgrStartSubcontainer, &args, nil); err != nil { + return fmt.Errorf("starting sub-container %v: %v", spec.Process.Args, err) } return nil } @@ -282,12 +298,12 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, fil defer conn.Close() // Configure the network. - if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { + if err := setupNetwork(conn, s.Pid, conf); err != nil { return fmt.Errorf("setting up network: %v", err) } // Restore the container and start the root container. - if err := conn.Call(boot.ContainerRestore, &opt, nil); err != nil { + if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil { return fmt.Errorf("restoring container %q: %v", cid, err) } @@ -305,7 +321,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { defer conn.Close() var pl []*control.Process - if err := conn.Call(boot.ContainerProcesses, &cid, &pl); err != nil { + if err := conn.Call(boot.ContMgrProcesses, &cid, &pl); err != nil { return nil, fmt.Errorf("retrieving process data from sandbox: %v", err) } return pl, nil @@ -318,8 +334,13 @@ func (s *Sandbox) NewCGroup() (*cgroup.Cgroup, error) { // Execute runs the specified command in the container. It returns the PID of // the newly created process. -func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { +func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) + + if err := s.configureStdios(conf, args.Files); err != nil { + return 0, err + } + conn, err := s.sandboxConnect() if err != nil { return 0, s.connError(err) @@ -328,7 +349,7 @@ func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { // Send a message to the sandbox control server to start the container. var pid int32 - if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil { + if err := conn.Call(boot.ContMgrExecuteAsync, args, &pid); err != nil { return 0, fmt.Errorf("executing command %q in sandbox: %v", args, err) } return pid, nil @@ -346,7 +367,7 @@ func (s *Sandbox) Event(cid string) (*boot.EventOut, error) { var e boot.EventOut // TODO(b/129292330): Pass in the container id (cid) here. The sandbox // should return events only for that container. - if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil { + if err := conn.Call(boot.ContMgrEvent, nil, &e); err != nil { return nil, fmt.Errorf("retrieving event data from sandbox: %v", err) } e.Event.ID = cid @@ -469,6 +490,61 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD)) nextFD++ + if conf.ProfileBlock != "" { + blockFile, err := os.OpenFile(conf.ProfileBlock, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening block profiling file %q: %v", conf.ProfileBlock, err) + } + defer blockFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, blockFile) + cmd.Args = append(cmd.Args, "--profile-block-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.ProfileCPU != "" { + cpuFile, err := os.OpenFile(conf.ProfileCPU, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening cpu profiling file %q: %v", conf.ProfileCPU, err) + } + defer cpuFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, cpuFile) + cmd.Args = append(cmd.Args, "--profile-cpu-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.ProfileHeap != "" { + heapFile, err := os.OpenFile(conf.ProfileHeap, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening heap profiling file %q: %v", conf.ProfileHeap, err) + } + defer heapFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, heapFile) + cmd.Args = append(cmd.Args, "--profile-heap-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.ProfileMutex != "" { + mutexFile, err := os.OpenFile(conf.ProfileMutex, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening mutex profiling file %q: %v", conf.ProfileMutex, err) + } + defer mutexFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, mutexFile) + cmd.Args = append(cmd.Args, "--profile-mutex-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.TraceFile != "" { + traceFile, err := os.OpenFile(conf.TraceFile, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening trace file %q: %v", conf.TraceFile, err) + } + defer traceFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, traceFile) + cmd.Args = append(cmd.Args, "--trace-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + // If there is a gofer, sends all socket ends to the sandbox. for _, f := range args.IOFiles { defer f.Close() @@ -505,6 +581,7 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.Stdin = nil cmd.Stdout = nil cmd.Stderr = nil + var stdios [3]*os.File // If the console control socket file is provided, then create a new // pty master/replica pair and set the TTY on the sandbox process. @@ -525,11 +602,9 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.SysProcAttr.Ctty = nextFD // Pass the tty as all stdio fds to sandbox. - for i := 0; i < 3; i++ { - cmd.ExtraFiles = append(cmd.ExtraFiles, tty) - cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) - nextFD++ - } + stdios[0] = tty + stdios[1] = tty + stdios[2] = tty if conf.Debug { // If debugging, send the boot process stdio to the @@ -541,11 +616,9 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn } else { // If not using a console, pass our current stdio as the // container stdio via flags. - for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { - cmd.ExtraFiles = append(cmd.ExtraFiles, f) - cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) - nextFD++ - } + stdios[0] = os.Stdin + stdios[1] = os.Stdout + stdios[2] = os.Stderr if conf.Debug { // If debugging, send the boot process stdio to the @@ -595,6 +668,10 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) } + // These are set to the uid/gid that the sandbox process will use. + s.UID = os.Getuid() + s.GID = os.Getgid() + // User namespace depends on the network type. Host network requires to run // inside the user namespace specified in the spec or the current namespace // if none is configured. @@ -636,51 +713,49 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn const nobody = 65534 if conf.Rootless { log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) - cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ - { - ContainerID: nobody, - HostID: os.Getuid(), - Size: 1, - }, - } - cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ - { - ContainerID: nobody, - HostID: os.Getgid(), - Size: 1, - }, - } - } else { // Map nobody in the new namespace to nobody in the parent namespace. - // - // A sandbox process will construct an empty - // root for itself, so it has to have - // CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities. - cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ - { - ContainerID: nobody, - HostID: nobody, - Size: 1, - }, - } - cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ - { - ContainerID: nobody, - HostID: nobody, - Size: 1, - }, - } + s.UID = nobody + s.GID = nobody } // Set credentials to run as user and group nobody. cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody} + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: s.UID, + Size: 1, + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: s.GID, + Size: 1, + }, + } + + // A sandbox process will construct an empty root for itself, so it has + // to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities. cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT)) + } else { return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") } } + if err := s.configureStdios(conf, stdios[:]); err != nil { + return fmt.Errorf("configuring stdios: %w", err) + } + for _, file := range stdios { + cmd.ExtraFiles = append(cmd.ExtraFiles, file) + cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + // Set Args[0] to make easier to spot the sandbox process. Otherwise it's + // shown as `exe`. cmd.Args[0] = "runsc-sandbox" if s.Cgroup != nil { @@ -796,8 +871,14 @@ func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) { // Try the Wait RPC to the sandbox. var ws unix.WaitStatus - err = conn.Call(boot.ContainerWait, &cid, &ws) + err = conn.Call(boot.ContMgrWait, &cid, &ws) + conn.Close() if err == nil { + if s.IsRootContainer(cid) { + if err := s.waitForStopped(); err != nil { + return unix.WaitStatus(0), err + } + } // It worked! return ws, nil } @@ -841,7 +922,7 @@ func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) { PID: pid, CID: cid, } - if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { + if err := conn.Call(boot.ContMgrWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err) } return ws, nil @@ -891,7 +972,7 @@ func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error { Signo: int32(sig), Mode: mode, } - if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { + if err := conn.Call(boot.ContMgrSignal, &args, nil); err != nil { return fmt.Errorf("signaling container %q: %v", cid, err) } return nil @@ -920,7 +1001,7 @@ func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProces PID: pid, Mode: mode, } - if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { + if err := conn.Call(boot.ContMgrSignal, &args, nil); err != nil { return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err) } return nil @@ -942,7 +1023,7 @@ func (s *Sandbox) Checkpoint(cid string, f *os.File) error { }, } - if err := conn.Call(boot.ContainerCheckpoint, &opt, nil); err != nil { + if err := conn.Call(boot.ContMgrCheckpoint, &opt, nil); err != nil { return fmt.Errorf("checkpointing container %q: %v", cid, err) } return nil @@ -957,7 +1038,7 @@ func (s *Sandbox) Pause(cid string) error { } defer conn.Close() - if err := conn.Call(boot.ContainerPause, nil, nil); err != nil { + if err := conn.Call(boot.LifecyclePause, nil, nil); err != nil { return fmt.Errorf("pausing container %q: %v", cid, err) } return nil @@ -972,12 +1053,114 @@ func (s *Sandbox) Resume(cid string) error { } defer conn.Close() - if err := conn.Call(boot.ContainerResume, nil, nil); err != nil { + if err := conn.Call(boot.LifecycleResume, nil, nil); err != nil { return fmt.Errorf("resuming container %q: %v", cid, err) } return nil } +// Cat sends the cat call for a container in the sandbox. +func (s *Sandbox) Cat(cid string, files []string, out *os.File) error { + log.Debugf("Cat sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.FsCat, &control.CatOpts{ + Files: files, + FilePayload: urpc.FilePayload{Files: []*os.File{out}}, + }, nil); err != nil { + return fmt.Errorf("Cat container %q: %v", cid, err) + } + return nil +} + +// Usage sends the collect call for a container in the sandbox. +func (s *Sandbox) Usage(cid string, Full bool) (control.MemoryUsage, error) { + log.Debugf("Usage sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return control.MemoryUsage{}, err + } + defer conn.Close() + + var m control.MemoryUsage + err = conn.Call(boot.UsageCollect, &control.MemoryUsageOpts{ + Full: Full, + }, &m) + return m, err +} + +// UsageFD sends the usagefd call for a container in the sandbox. +func (s *Sandbox) UsageFD(cid string) (*control.MemoryUsageRecord, error) { + log.Debugf("Usage sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return nil, err + } + defer conn.Close() + + var m control.MemoryUsageFile + if err := conn.Call(boot.UsageUsageFD, &control.MemoryUsageFileOpts{ + Version: 1, + }, &m); err != nil { + return nil, fmt.Errorf("UsageFD failed: %v", err) + } + + if len(m.FilePayload.Files) != 2 { + return nil, fmt.Errorf("wants exactly two fds") + } + + return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1]) +} + +// Reduce sends the reduce call for a container in the sandbox. +func (s *Sandbox) Reduce(cid string, wait bool) error { + log.Debugf("Reduce sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + return conn.Call(boot.UsageReduce, &control.UsageReduceOpts{ + Wait: wait, + }, nil) +} + +// Stream sends the AttachDebugEmitter call for a container in the sandbox, and +// dumps filtered events to out. +func (s *Sandbox) Stream(cid string, filters []string, out *os.File) error { + log.Debugf("Stream sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + r, w, err := unet.SocketPair(false) + if err != nil { + return err + } + + wfd, err := w.Release() + if err != nil { + return fmt.Errorf("failed to release write socket FD: %v", err) + } + + if err := conn.Call(boot.EventsAttachDebugEmitter, &control.EventsOpts{ + FilePayload: urpc.FilePayload{Files: []*os.File{ + os.NewFile(uintptr(wfd), "event sink"), + }}, + }, nil); err != nil { + return fmt.Errorf("AttachDebugEmitter failed: %v", err) + } + + return eventchannel.ProcessAll(r, filters, out) +} + // IsRunning returns true if the sandbox or gofer process is running. func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { @@ -1000,7 +1183,7 @@ func (s *Sandbox) Stacks() (string, error) { defer conn.Close() var stacks string - if err := conn.Call(boot.SandboxStacks, nil, &stacks); err != nil { + if err := conn.Call(boot.DebugStacks, nil, &stacks); err != nil { return "", fmt.Errorf("getting sandbox %q stacks: %v", s.ID, err) } return stacks, nil @@ -1019,7 +1202,7 @@ func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error { FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Delay: delay, } - return conn.Call(boot.HeapProfile, &opts, nil) + return conn.Call(boot.ProfileHeap, &opts, nil) } // CPUProfile collects a CPU profile. @@ -1035,7 +1218,7 @@ func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error { FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } - return conn.Call(boot.CPUProfile, &opts, nil) + return conn.Call(boot.ProfileCPU, &opts, nil) } // BlockProfile writes a block profile to the given file. @@ -1051,7 +1234,7 @@ func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error { FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } - return conn.Call(boot.BlockProfile, &opts, nil) + return conn.Call(boot.ProfileBlock, &opts, nil) } // MutexProfile writes a mutex profile to the given file. @@ -1067,7 +1250,7 @@ func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error { FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } - return conn.Call(boot.MutexProfile, &opts, nil) + return conn.Call(boot.ProfileMutex, &opts, nil) } // Trace collects an execution trace. @@ -1083,7 +1266,7 @@ func (s *Sandbox) Trace(f *os.File, duration time.Duration) error { FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } - return conn.Call(boot.Trace, &opts, nil) + return conn.Call(boot.ProfileTrace, &opts, nil) } // ChangeLogging changes logging options. @@ -1095,7 +1278,7 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { } defer conn.Close() - if err := conn.Call(boot.ChangeLogging, &args, nil); err != nil { + if err := conn.Call(boot.LoggingChange, &args, nil); err != nil { return fmt.Errorf("changing sandbox %q logging: %v", s.ID, err) } return nil @@ -1126,34 +1309,33 @@ func (s *Sandbox) destroyContainer(cid string) error { return err } defer conn.Close() - if err := conn.Call(boot.ContainerDestroy, &cid, nil); err != nil { + if err := conn.Call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil { return fmt.Errorf("destroying container %q: %v", cid, err) } return nil } func (s *Sandbox) waitForStopped() error { + if s.child { + s.statusMu.Lock() + defer s.statusMu.Unlock() + if s.Pid == 0 { + return nil + } + // The sandbox process is a child of the current process, + // so we can wait it and collect its zombie. + if _, err := unix.Wait4(int(s.Pid), &s.status, 0, nil); err != nil { + return fmt.Errorf("error waiting the sandbox process: %v", err) + } + s.Pid = 0 + return nil + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if s.child { - s.statusMu.Lock() - defer s.statusMu.Unlock() - if s.Pid == 0 { - return nil - } - // The sandbox process is a child of the current process, - // so we can wait it and collect its zombie. - wpid, err := unix.Wait4(int(s.Pid), &s.status, unix.WNOHANG, nil) - if err != nil { - return fmt.Errorf("error waiting the sandbox process: %v", err) - } - if wpid == 0 { - return fmt.Errorf("sandbox is still running") - } - s.Pid = 0 - } else if s.IsRunning() { + if s.IsRunning() { return fmt.Errorf("sandbox is still running") } return nil @@ -1161,6 +1343,23 @@ func (s *Sandbox) waitForStopped() error { return backoff.Retry(op, b) } +// configureStdios change stdios ownership to give access to the sandbox +// process. This may be skipped depending on the configuration. +func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error { + if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + // Cannot change ownership without CAP_CHOWN. + return nil + } + + for _, file := range stdios { + log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID) + if err := file.Chown(s.UID, s.GID); err != nil { + return err + } + } + return nil +} + // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. func deviceFileForPlatform(name string) (*os.File, error) { diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go index 9ecd0fde6..ac20696ee 100644 --- a/runsc/specutils/fs.go +++ b/runsc/specutils/fs.go @@ -67,8 +67,8 @@ var optionsMap = map[string]mapping{ // verityMountOptions is the set of valid verity mount option keys. var verityMountOptions = map[string]struct{}{ - "verity.roothash": struct{}{}, - "verity.action": struct{}{}, + "verity.roothash": {}, + "verity.action": {}, } // propOptionsMap is similar to optionsMap, but it lists propagation options diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 69d7ba5c4..21559f5e5 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -270,7 +270,10 @@ func MaybeRunAsRoot() error { go func() { for { // Forward all signals to child process. - cmd.Process.Signal(<-ch) + sig := <-ch + if err := cmd.Process.Signal(sig); err != nil { + log.Warningf("Error forwarding signal %v to child (PID %d)", sig, cmd.Process.Pid) + } } }() if err := cmd.Wait(); err != nil { diff --git a/runsc/specutils/safemount_test/BUILD b/runsc/specutils/safemount_test/BUILD new file mode 100644 index 000000000..c39c40492 --- /dev/null +++ b/runsc/specutils/safemount_test/BUILD @@ -0,0 +1,23 @@ +load("//tools:defs.bzl", "go_binary", "go_test") + +package(licenses = ["notice"]) + +go_test( + name = "safemount_test", + size = "small", + srcs = ["safemount_test.go"], + data = [":safemount_runner"], + deps = [ + "//pkg/test/testutil", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_binary( + name = "safemount_runner", + srcs = ["safemount_runner.go"], + deps = [ + "//runsc/specutils", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/specutils/safemount_test/safemount_runner.go b/runsc/specutils/safemount_test/safemount_runner.go new file mode 100644 index 000000000..b23193033 --- /dev/null +++ b/runsc/specutils/safemount_test/safemount_runner.go @@ -0,0 +1,117 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// safemount_runner is used to test the SafeMount function. Because use of +// unix.Mount requires privilege, tests must launch this process with +// CLONE_NEWNS and CLONE_NEWUSER. +package main + +import ( + "errors" + "fmt" + "log" + "os" + "path/filepath" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/runsc/specutils" +) + +func main() { + // The test temporary directory is the first argument. + tempdir := os.Args[1] + + tcs := []struct { + name string + testfunc func() error + }{{ + name: "unix.Mount to folder succeeds", + testfunc: func() error { + dir2Path := filepath.Join(tempdir, "subdir2") + if err := unix.Mount(filepath.Join(tempdir, "subdir"), dir2Path, "bind", unix.MS_BIND, ""); err != nil { + return fmt.Errorf("mount: %v", err) + } + return unix.Unmount(dir2Path, unix.MNT_DETACH) + }, + }, { + // unix.Mount doesn't care whether the target is a symlink. + name: "unix.Mount to symlink succeeds", + testfunc: func() error { + symlinkPath := filepath.Join(tempdir, "symlink") + if err := unix.Mount(filepath.Join(tempdir, "subdir"), symlinkPath, "bind", unix.MS_BIND, ""); err != nil { + return fmt.Errorf("mount: %v", err) + } + return unix.Unmount(symlinkPath, unix.MNT_DETACH) + }, + }, { + name: "SafeMount to folder succeeds", + testfunc: func() error { + dir2Path := filepath.Join(tempdir, "subdir2") + if err := specutils.SafeMount(filepath.Join(tempdir, "subdir"), dir2Path, "bind", unix.MS_BIND, "", "/proc"); err != nil { + return fmt.Errorf("SafeMount: %v", err) + } + return unix.Unmount(dir2Path, unix.MNT_DETACH) + }, + }, { + name: "SafeMount to symlink fails", + testfunc: func() error { + err := specutils.SafeMount(filepath.Join(tempdir, "subdir"), filepath.Join(tempdir, "symlink"), "bind", unix.MS_BIND, "", "/proc") + if err == nil { + return fmt.Errorf("SafeMount didn't fail, but should have") + } + var symErr *specutils.ErrSymlinkMount + if !errors.As(err, &symErr) { + return fmt.Errorf("expected SafeMount to fail with ErrSymlinkMount, but got: %v", err) + } + return nil + }, + }} + + for _, tc := range tcs { + if err := runTest(tempdir, tc.testfunc); err != nil { + log.Fatalf("failed test %q: %v", tc.name, err) + } + } +} + +// runTest runs testfunc with the following directory structure: +// tempdir/ +// subdir/ +// subdir2/ +// symlink --> ./subdir2 +func runTest(tempdir string, testfunc func() error) error { + // Create tempdir/subdir/. + dirPath := filepath.Join(tempdir, "subdir") + if err := os.Mkdir(dirPath, 0777); err != nil { + return fmt.Errorf("os.Mkdir(%s, 0777)", dirPath) + } + defer os.Remove(dirPath) + + // Create tempdir/subdir2/. + dir2Path := filepath.Join(tempdir, "subdir2") + if err := os.Mkdir(dir2Path, 0777); err != nil { + return fmt.Errorf("os.Mkdir(%s, 0777)", dir2Path) + } + defer os.Remove(dir2Path) + + // Create tempdir/symlink, which points to ./subdir2. + symlinkPath := filepath.Join(tempdir, "symlink") + if err := os.Symlink("./subdir2", symlinkPath); err != nil { + return fmt.Errorf("failed to create symlink %s: %v", symlinkPath, err) + } + defer os.Remove(symlinkPath) + + // Run the actual test. + return testfunc() +} diff --git a/runsc/specutils/safemount_test/safemount_test.go b/runsc/specutils/safemount_test/safemount_test.go new file mode 100644 index 000000000..8820978c4 --- /dev/null +++ b/runsc/specutils/safemount_test/safemount_test.go @@ -0,0 +1,53 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemount_test + +import ( + "os" + "os/exec" + "syscall" + "testing" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +func TestSafeMount(t *testing.T) { + // We run the actual tests in another process, as we need CAP_SYS_ADMIN to + // call mount(2). The new process runs in its own user and mount namespaces. + runner, err := testutil.FindFile("runsc/specutils/safemount_test/safemount_runner") + if err != nil { + t.Fatalf("failed to find test runner binary: %v", err) + } + cmd := exec.Command(runner, t.TempDir()) + cmd.SysProcAttr = &unix.SysProcAttr{ + Cloneflags: unix.CLONE_NEWNS | unix.CLONE_NEWUSER, + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + GidMappingsEnableSetgroups: false, + Credential: &syscall.Credential{ + Uid: 0, + Gid: 0, + }, + } + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("failed running %s with error: %v\ntest output:\n%s", cmd, err, output) + } +} diff --git a/runsc/specutils/seccomp/audit_amd64.go b/runsc/specutils/seccomp/audit_amd64.go index 417cf4a7a..5ef3edaea 100644 --- a/runsc/specutils/seccomp/audit_amd64.go +++ b/runsc/specutils/seccomp/audit_amd64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build amd64 // +build amd64 package seccomp diff --git a/runsc/specutils/seccomp/audit_arm64.go b/runsc/specutils/seccomp/audit_arm64.go index b727ceff2..6253cba61 100644 --- a/runsc/specutils/seccomp/audit_arm64.go +++ b/runsc/specutils/seccomp/audit_arm64.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build arm64 // +build arm64 package seccomp diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index c228d6299..5365b5b1b 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -217,7 +217,7 @@ func ReadMounts(f *os.File) ([]specs.Mount, error) { } var mounts []specs.Mount if err := json.Unmarshal(bytes, &mounts); err != nil { - return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(bytes)) + return nil, fmt.Errorf("error unmarshaling mounts: %v\nJSON bytes:\n%s", err, string(bytes)) } return mounts, nil } @@ -434,10 +434,12 @@ func DebugLogFile(logPattern, command, test string) (*os.File, error) { return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) } -// Mount creates the mount point and calls Mount with the given flags. -func Mount(src, dst, typ string, flags uint32) error { - // Create the mount point inside. The type must be the same as the - // source (file or directory). +// SafeSetupAndMount creates the mount point and calls Mount with the given +// flags. procPath is the path to procfs. If it is "", procfs is assumed to be +// mounted at /proc. +func SafeSetupAndMount(src, dst, typ string, flags uint32, procPath string) error { + // Create the mount point inside. The type must be the same as the source + // (file or directory). var isDir bool if typ == "proc" { // Special case, as there is no source directory for proc mounts. @@ -468,12 +470,50 @@ func Mount(src, dst, typ string, flags uint32) error { } // Do the mount. - if err := unix.Mount(src, dst, typ, uintptr(flags), ""); err != nil { + if err := SafeMount(src, dst, typ, uintptr(flags), "", procPath); err != nil { return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err) } return nil } +// ErrSymlinkMount is returned by SafeMount when the mount destination is found +// to be a symlink. +type ErrSymlinkMount struct { + error +} + +// SafeMount is like unix.Mount, but will fail if dst is a symlink. procPath is +// the path to procfs. If it is "", procfs is assumed to be mounted at /proc. +// +// SafeMount can fail when dst contains a symlink. However, it is called in the +// normal case with a destination consisting of a known root (/proc/root) and +// symlink-free path (from resolveSymlink). +func SafeMount(src, dst, fstype string, flags uintptr, data, procPath string) error { + // Open the destination. + fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("failed to safely mount: Open(%s, _, _): %w", dst, err) + } + defer unix.Close(fd) + + // Use /proc/self/fd/ to verify that we opened the intended destination. This + // guards against dst being a symlink, in which case we could accidentally + // mount over the symlink's target. + if procPath == "" { + procPath = "/proc" + } + safePath := filepath.Join(procPath, "self/fd", strconv.Itoa(fd)) + target, err := os.Readlink(safePath) + if err != nil { + return fmt.Errorf("failed to safely mount: Readlink(%s): %w", safePath, err) + } + if dst != target { + return &ErrSymlinkMount{fmt.Errorf("failed to safely mount: expected to open %s, but found %s", dst, target)} + } + + return unix.Mount(src, safePath, fstype, flags, data) +} + // ContainsStr returns true if 'str' is inside 'strs'. func ContainsStr(strs []string, str string) bool { for _, s := range strs { diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 2c86fffe8..e2d3a75dc 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -29,7 +29,7 @@ func TestWaitForReadyHappy(t *testing.T) { if err := cmd.Start(); err != nil { t.Fatalf("cmd.Start() failed, err: %v", err) } - defer cmd.Wait() + defer func() { _ = cmd.Wait() }() var count int err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { @@ -42,7 +42,9 @@ func TestWaitForReadyHappy(t *testing.T) { if err != nil { t.Errorf("ProcessWaitReady got: %v, expected: nil", err) } - cmd.Process.Kill() + if err := cmd.Process.Kill(); err != nil { + t.Errorf("cmd.ProcessKill(): %v", err) + } } func TestWaitForReadyFail(t *testing.T) { @@ -50,7 +52,7 @@ func TestWaitForReadyFail(t *testing.T) { if err := cmd.Start(); err != nil { t.Fatalf("cmd.Start() failed, err: %v", err) } - defer cmd.Wait() + defer func() { _ = cmd.Wait() }() var count int err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { @@ -58,12 +60,14 @@ func TestWaitForReadyFail(t *testing.T) { count++ return false, nil } - return false, fmt.Errorf("Fake error") + return false, fmt.Errorf("fake error") }) if err == nil { t.Errorf("ProcessWaitReady got: nil, expected: error") } - cmd.Process.Kill() + if err := cmd.Process.Kill(); err != nil { + t.Errorf("cmd.ProcessKill(): %v", err) + } } func TestWaitForReadyNotRunning(t *testing.T) { @@ -71,7 +75,7 @@ func TestWaitForReadyNotRunning(t *testing.T) { if err := cmd.Start(); err != nil { t.Fatalf("cmd.Start() failed, err: %v", err) } - defer cmd.Wait() + defer func() { _ = cmd.Wait() }() err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { return false, nil @@ -89,15 +93,17 @@ func TestWaitForReadyTimeout(t *testing.T) { if err := cmd.Start(); err != nil { t.Fatalf("cmd.Start() failed, err: %v", err) } - defer cmd.Wait() + defer func() { _ = cmd.Wait() }() err := WaitForReady(cmd.Process.Pid, 50*time.Millisecond, func() (bool, error) { return false, nil }) - if !strings.Contains(err.Error(), "not running yet") { + if err == nil || !strings.Contains(err.Error(), "not running yet") { t.Errorf("ProcessWaitReady got: %v, expected: not running yet", err) } - cmd.Process.Kill() + if err := cmd.Process.Kill(); err != nil { + t.Errorf("cmd.ProcessKill(): %v", err) + } } func TestSpecInvalid(t *testing.T) { diff --git a/runsc/version.go b/runsc/version.go index ab9194b9d..c250f4a2a 100644 --- a/runsc/version.go +++ b/runsc/version.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package main // version is set during linking. diff --git a/test/benchmarks/base/startup_test.go b/test/benchmarks/base/startup_test.go index 05a43ad17..197241622 100644 --- a/test/benchmarks/base/startup_test.go +++ b/test/benchmarks/base/startup_test.go @@ -34,15 +34,19 @@ func BenchmarkStartupEmpty(b *testing.B) { defer machine.CleanUp() ctx := context.Background() + b.StopTimer() + b.ResetTimer() for i := 0; i < b.N; i++ { harness.DebugLog(b, "Running container: %d", i) container := machine.GetContainer(ctx, b) - defer container.CleanUp(ctx) - if _, err := container.Run(ctx, dockerutil.RunOpts{ + b.StartTimer() + if err := container.Spawn(ctx, dockerutil.RunOpts{ Image: "benchmarks/alpine", - }, "true"); err != nil { - b.Fatalf("failed to run container: %v", err) + }, "sleep", "100"); err != nil { + b.Fatalf("failed to start container: %v", err) } + b.StopTimer() + container.CleanUp(ctx) harness.DebugLog(b, "Ran container: %d", i) } } diff --git a/test/benchmarks/tcp/tcp_proxy.go b/test/benchmarks/tcp/tcp_proxy.go index be74e4d4a..bc0d8fd38 100644 --- a/test/benchmarks/tcp/tcp_proxy.go +++ b/test/benchmarks/tcp/tcp_proxy.go @@ -208,8 +208,12 @@ func newNetstackImpl(mode string) (impl, error) { if err := s.CreateNIC(nicID, fifo.New(ep, runtime.GOMAXPROCS(0), 1000)); err != nil { return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err) } - if err := s.AddAddress(nicID, ipv4.ProtocolNumber, parsedAddr); err != nil { - return nil, fmt.Errorf("error adding IP address to %q: %v", *iface, err) + protocolAddr := tcpip.ProtocolAddress{ + Protocol: ipv4.ProtocolNumber, + AddressWithPrefix: parsedAddr.WithPrefix(), + } + if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + return nil, fmt.Errorf("error adding IP address %+v to %q: %w", protocolAddr, *iface, err) } subnet, err := tcpip.NewSubnet(parsedDest, parsedMask) diff --git a/test/benchmarks/tools/fio_test.go b/test/benchmarks/tools/fio_test.go index a98277150..3b1f852ce 100644 --- a/test/benchmarks/tools/fio_test.go +++ b/test/benchmarks/tools/fio_test.go @@ -86,7 +86,7 @@ func TestFio(t *testing.T) { fio := Fio{} // WriteBandwidth. got, err := fio.parseBandwidth(sampleData, false) - var want float64 = 1753471.0 * 1024 + want := 1753471.0 * 1024 if err != nil { t.Fatalf("parse failed with err: %v", err) } else if got != want { diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index 1accc3b3b..d41139944 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -30,6 +30,7 @@ import ( "net/http" "os" "path/filepath" + "regexp" "strconv" "strings" "testing" @@ -43,6 +44,12 @@ import ( // defaultWait is the default wait time used for tests. const defaultWait = time.Minute +func TestMain(m *testing.M) { + dockerutil.EnsureSupportedDockerVersion() + flag.Parse() + os.Exit(m.Run()) +} + // httpRequestSucceeds sends a request to a given url and checks that the status is OK. func httpRequestSucceeds(client http.Client, server string, port int) error { url := fmt.Sprintf("http://%s:%d", server, port) @@ -426,10 +433,10 @@ func TestTmpMount(t *testing.T) { // Test that it is allowed to mount a file on top of /dev files, e.g. // /dev/random. func TestMountOverDev(t *testing.T) { - if usingVFS2, err := dockerutil.UsingVFS2(); !usingVFS2 { - t.Skip("VFS1 doesn't allow /dev/random to be mounted.") - } else if err != nil { + if vfs2, err := dockerutil.UsingVFS2(); err != nil { t.Fatalf("Failed to read config for runtime %s: %v", dockerutil.Runtime(), err) + } else if !vfs2 { + t.Skip("VFS1 doesn't allow /dev/random to be mounted.") } random, err := ioutil.TempFile(testutil.TmpDir(), "random") @@ -574,11 +581,12 @@ func runIntegrationTest(t *testing.T, capAdd []string, args ...string) { d := dockerutil.MakeContainer(ctx, t) defer d.CleanUp(ctx) - if got, err := d.Run(ctx, dockerutil.RunOpts{ + opts := dockerutil.RunOpts{ Image: "basic/integrationtest", WorkDir: "/root", CapAdd: capAdd, - }, args...); err != nil { + } + if got, err := d.Run(ctx, opts, args...); err != nil { t.Fatalf("docker run failed: %v", err) } else if got != "" { t.Errorf("test failed:\n%s", got) @@ -609,8 +617,174 @@ func TestBindOverlay(t *testing.T) { } } -func TestMain(m *testing.M) { - dockerutil.EnsureSupportedDockerVersion() - flag.Parse() - os.Exit(m.Run()) +func TestStdios(t *testing.T) { + if vfs2, err := dockerutil.UsingVFS2(); err != nil { + t.Fatalf("Failed to read config for runtime %s: %v", dockerutil.Runtime(), err) + } else if !vfs2 { + t.Skip("VFS1 doesn't adjust stdios user") + } + + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + testStdios(t, func(user string, args ...string) (string, error) { + defer d.CleanUp(ctx) + opts := dockerutil.RunOpts{ + Image: "basic/alpine", + User: user, + } + return d.Run(ctx, opts, args...) + }) +} + +func TestStdiosExec(t *testing.T) { + if vfs2, err := dockerutil.UsingVFS2(); err != nil { + t.Fatalf("Failed to read config for runtime %s: %v", dockerutil.Runtime(), err) + } else if !vfs2 { + t.Skip("VFS1 doesn't adjust stdios user") + } + + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + runOpts := dockerutil.RunOpts{Image: "basic/alpine"} + if err := d.Spawn(ctx, runOpts, "sleep", "100"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + + testStdios(t, func(user string, args ...string) (string, error) { + opts := dockerutil.ExecOpts{User: user} + return d.Exec(ctx, opts, args...) + }) +} + +func testStdios(t *testing.T, run func(string, ...string) (string, error)) { + const cmd = "stat -L /proc/self/fd/0 /proc/self/fd/1 /proc/self/fd/2 | grep 'Uid:'" + got, err := run("123", "/bin/sh", "-c", cmd) + if err != nil { + t.Fatalf("docker exec failed: %v", err) + } + if len(got) == 0 { + t.Errorf("Unexpected empty output from %q", cmd) + } + re := regexp.MustCompile(`Uid: \(\s*(\w+)\/.*\)`) + for _, line := range strings.SplitN(got, "\n", 3) { + t.Logf("stat -L: %s", line) + matches := re.FindSubmatch([]byte(line)) + if len(matches) != 2 { + t.Fatalf("wrong output format: %q: matches: %v", line, matches) + } + if want, got := "123", string(matches[1]); want != got { + t.Errorf("wrong user, want: %q, got: %q", want, got) + } + } + + // Check that stdout and stderr can be open and written to. This checks + // that ownership and permissions are correct inside gVisor. + got, err = run("456", "/bin/sh", "-c", "echo foobar | tee /proc/self/fd/1 > /proc/self/fd/2") + if err != nil { + t.Fatalf("docker run failed: %v", err) + } + t.Logf("echo foobar: %q", got) + // Check it repeats twice, once for stdout and once for stderr. + if want := "foobar\nfoobar\n"; want != got { + t.Errorf("Wrong echo output, want: %q, got: %q", want, got) + } + + // Check that timestamps can be changed. Setting timestamps require an extra + // write check _after_ the file was opened, and may fail if the underlying + // host file is not setup correctly. + if _, err := run("789", "touch", "/proc/self/fd/0", "/proc/self/fd/1", "/proc/self/fd/2"); err != nil { + t.Fatalf("docker run failed: %v", err) + } +} + +func TestStdiosChown(t *testing.T) { + if vfs2, err := dockerutil.UsingVFS2(); err != nil { + t.Fatalf("Failed to read config for runtime %s: %v", dockerutil.Runtime(), err) + } else if !vfs2 { + t.Skip("VFS1 doesn't adjust stdios user") + } + + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + opts := dockerutil.RunOpts{Image: "basic/alpine"} + if _, err := d.Run(ctx, opts, "chown", "123", "/proc/self/fd/0", "/proc/self/fd/1", "/proc/self/fd/2"); err != nil { + t.Fatalf("docker run failed: %v", err) + } +} + +func TestUnmount(t *testing.T) { + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + dir, err := ioutil.TempDir(testutil.TmpDir(), "sub-mount") + if err != nil { + t.Fatalf("TempDir(): %v", err) + } + opts := dockerutil.RunOpts{ + Image: "basic/alpine", + Privileged: true, // Required for umount + Mounts: []mount.Mount{ + { + Type: mount.TypeBind, + Source: dir, + Target: "/foo", + }, + }, + } + if _, err := d.Run(ctx, opts, "umount", "/foo"); err != nil { + t.Fatalf("docker run failed: %v", err) + } +} + +func TestDeleteInterface(t *testing.T) { + if testutil.IsRunningWithHostNet() { + t.Skip("not able to remove interfaces on hostnet") + } + + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + opts := dockerutil.RunOpts{ + Image: "basic/alpine", + CapAdd: []string{"NET_ADMIN"}, + } + if err := d.Spawn(ctx, opts, "sleep", "1000"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + + // We should be able to remove eth0. + output, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", "ip link del dev eth0") + if err != nil { + t.Fatalf("failed to remove eth0: %s, output: %s", err, output) + } + // Verify that eth0 is no longer there. + output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", "ip link show") + if err != nil { + t.Fatalf("docker exec ip link show failed: %s, output: %s", err, output) + } + if strings.Contains(output, "eth0") { + t.Fatalf("failed to remove eth0") + } + + // Loopback device can't be removed. + output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", "ip link del dev lo") + if err == nil { + t.Fatalf("should not remove the loopback device: %v", output) + } + // Verify that lo is still there. + output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/bin/sh", "-c", "ip link show") + if err != nil { + t.Fatalf("docker exec ip link show failed: %s, output: %s", err, output) + } + if !strings.Contains(output, "lo") { + t.Fatalf("loopback interface is removed") + } } diff --git a/test/fsstress/BUILD b/test/fsstress/BUILD index e74e7fff2..402445a7e 100644 --- a/test/fsstress/BUILD +++ b/test/fsstress/BUILD @@ -14,7 +14,11 @@ go_test( "manual", "local", ], - deps = ["//pkg/test/dockerutil"], + deps = [ + "//pkg/test/dockerutil", + "//pkg/test/testutil", + "@com_github_docker_docker//api/types/mount:go_default_library", + ], ) go_library( diff --git a/test/fsstress/fsstress_test.go b/test/fsstress/fsstress_test.go index d53c8f90d..d9513c42c 100644 --- a/test/fsstress/fsstress_test.go +++ b/test/fsstress/fsstress_test.go @@ -18,6 +18,8 @@ package fsstress import ( "context" "flag" + "fmt" + "io/ioutil" "math/rand" "os" "strconv" @@ -25,7 +27,9 @@ import ( "testing" "time" + "github.com/docker/docker/api/types/mount" "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" ) func init() { @@ -42,6 +46,7 @@ type config struct { operations string processes string target string + mounts []mount.Mount } func fsstress(t *testing.T, conf config) { @@ -52,8 +57,19 @@ func fsstress(t *testing.T, conf config) { const image = "basic/fsstress" seed := strconv.FormatUint(uint64(rand.Uint32()), 10) args := []string{"-d", conf.target, "-n", conf.operations, "-p", conf.processes, "-s", seed, "-X"} - t.Logf("Repro: docker run --rm --runtime=%s gvisor.dev/images/%s %s", dockerutil.Runtime(), image, strings.Join(args, " ")) - out, err := d.Run(ctx, dockerutil.RunOpts{Image: image}, args...) + opts := dockerutil.RunOpts{ + Image: image, + Mounts: conf.mounts, + } + var mounts string + if len(conf.mounts) > 0 { + mounts = " -v " + for _, m := range conf.mounts { + mounts += fmt.Sprintf("-v <any_dir>:%s", m.Target) + } + } + t.Logf("Repro: docker run --rm --runtime=%s%s gvisor.dev/images/%s %s", dockerutil.Runtime(), mounts, image, strings.Join(args, " ")) + out, err := d.Run(ctx, opts, args...) if err != nil { t.Fatalf("docker run failed: %v\noutput: %s", err, out) } @@ -64,6 +80,39 @@ func fsstress(t *testing.T, conf config) { } } +func TestFsstressGofer(t *testing.T) { + // This takes between 30-60s to run on my machine. Adjust as needed. + cfg := config{ + operations: "500", + processes: "20", + target: "/test", + } + fsstress(t, cfg) +} + +func TestFsstressGoferShared(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "fsstress") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + defer os.RemoveAll(dir) + + // This takes between 30-60s to run on my machine. Adjust as needed. + cfg := config{ + operations: "500", + processes: "20", + target: "/test", + mounts: []mount.Mount{ + { + Source: dir, + Target: "/test", + Type: "bind", + }, + }, + } + fsstress(t, cfg) +} + func TestFsstressTmpfs(t *testing.T) { // This takes between 10s to run on my machine. Adjust as needed. cfg := config{ diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md index fe0976ba5..3f0f9d155 100644 --- a/test/packetimpact/README.md +++ b/test/packetimpact/README.md @@ -16,7 +16,7 @@ Packetimpact aims to provide: * A **multi-platform** solution that can test both Linux and gVisor. * **Conciseness** on par with packetdrill scripts. * **Control-flow** like for loops, conditionals, and variables. -* **Flexibilty** to specify every byte in a packet or use multiple sockets. +* **Flexibility** to specify every byte in a packet or use multiple sockets. ## How to run packetimpact tests? @@ -177,7 +177,7 @@ message SocketResponse { The test bench does most of the work in a test. It is a Go program that compiles on the host and is copied by the script into test bench's container. It is a regular [go unit test](https://golang.org/pkg/testing/) that imports the test -bench framework. The test bench framwork is based on three basic utilities: +bench framework. The test bench framework is based on three basic utilities: * Commanding the DUT to run POSIX commands and return responses. * Sending raw packets to the DUT on the test network. @@ -475,9 +475,9 @@ type layerState interface { // as it was sent is available. sent(sent Layer) error - // received updates the layerState based on a Layer that is receieved. The + // received updates the layerState based on a Layer that is received. The // input is a Layer with all prev and next pointers populated so that the - // entire frame as it was receieved is available. + // entire frame as it was received is available. received(received Layer) error // close frees associated resources held by the LayerState. diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc index ea83bbe72..49f41c887 100644 --- a/test/packetimpact/dut/posix_server.cc +++ b/test/packetimpact/dut/posix_server.cc @@ -28,10 +28,10 @@ #include <iostream> #include <unordered_map> +#include "absl/strings/str_format.h" #include "include/grpcpp/security/server_credentials.h" #include "include/grpcpp/server_builder.h" #include "include/grpcpp/server_context.h" -#include "absl/strings/str_format.h" #include "test/packetimpact/proto/posix_server.grpc.pb.h" #include "test/packetimpact/proto/posix_server.pb.h" diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go index 0cac0bf1b..7e89ba2b3 100644 --- a/test/packetimpact/testbench/dut.go +++ b/test/packetimpact/testbench/dut.go @@ -180,9 +180,7 @@ func (dut *DUT) CreateListener(t *testing.T, typ, proto, backlog int32) (int32, func (dut *DUT) Accept(t *testing.T, sockfd int32) (int32, unix.Sockaddr) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - fd, sa, err := dut.AcceptWithErrno(ctx, t, sockfd) + fd, sa, err := dut.AcceptWithErrno(context.Background(), t, sockfd) if fd < 0 { t.Fatalf("failed to accept: %s", err) } @@ -209,9 +207,7 @@ func (dut *DUT) AcceptWithErrno(ctx context.Context, t *testing.T, sockfd int32) func (dut *DUT) Bind(t *testing.T, fd int32, sa unix.Sockaddr) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.BindWithErrno(ctx, t, fd, sa) + ret, err := dut.BindWithErrno(context.Background(), t, fd, sa) if ret != 0 { t.Fatalf("failed to bind socket: %s", err) } @@ -238,9 +234,7 @@ func (dut *DUT) BindWithErrno(ctx context.Context, t *testing.T, fd int32, sa un func (dut *DUT) Close(t *testing.T, fd int32) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.CloseWithErrno(ctx, t, fd) + ret, err := dut.CloseWithErrno(context.Background(), t, fd) if ret != 0 { t.Fatalf("failed to close: %s", err) } @@ -266,9 +260,7 @@ func (dut *DUT) CloseWithErrno(ctx context.Context, t *testing.T, fd int32) (int func (dut *DUT) Connect(t *testing.T, fd int32, sa unix.Sockaddr) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.ConnectWithErrno(ctx, t, fd, sa) + ret, err := dut.ConnectWithErrno(context.Background(), t, fd, sa) // Ignore 'operation in progress' error that can be returned when the socket // is non-blocking. if err != unix.EINPROGRESS && ret != 0 { @@ -297,9 +289,7 @@ func (dut *DUT) ConnectWithErrno(ctx context.Context, t *testing.T, fd int32, sa func (dut *DUT) GetSockName(t *testing.T, sockfd int32) unix.Sockaddr { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, sa, err := dut.GetSockNameWithErrno(ctx, t, sockfd) + ret, sa, err := dut.GetSockNameWithErrno(context.Background(), t, sockfd) if ret != 0 { t.Fatalf("failed to getsockname: %s", err) } @@ -349,9 +339,7 @@ func (dut *DUT) getSockOpt(ctx context.Context, t *testing.T, sockfd, level, opt func (dut *DUT) GetSockOpt(t *testing.T, sockfd, level, optname, optlen int32) []byte { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, optval, err := dut.GetSockOptWithErrno(ctx, t, sockfd, level, optname, optlen) + ret, optval, err := dut.GetSockOptWithErrno(context.Background(), t, sockfd, level, optname, optlen) if ret != 0 { t.Fatalf("failed to GetSockOpt: %s", err) } @@ -378,9 +366,7 @@ func (dut *DUT) GetSockOptWithErrno(ctx context.Context, t *testing.T, sockfd, l func (dut *DUT) GetSockOptInt(t *testing.T, sockfd, level, optname int32) int32 { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, intval, err := dut.GetSockOptIntWithErrno(ctx, t, sockfd, level, optname) + ret, intval, err := dut.GetSockOptIntWithErrno(context.Background(), t, sockfd, level, optname) if ret != 0 { t.Fatalf("failed to GetSockOptInt: %s", err) } @@ -405,9 +391,7 @@ func (dut *DUT) GetSockOptIntWithErrno(ctx context.Context, t *testing.T, sockfd func (dut *DUT) GetSockOptTimeval(t *testing.T, sockfd, level, optname int32) unix.Timeval { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, timeval, err := dut.GetSockOptTimevalWithErrno(ctx, t, sockfd, level, optname) + ret, timeval, err := dut.GetSockOptTimevalWithErrno(context.Background(), t, sockfd, level, optname) if ret != 0 { t.Fatalf("failed to GetSockOptTimeval: %s", err) } @@ -434,9 +418,7 @@ func (dut *DUT) GetSockOptTimevalWithErrno(ctx context.Context, t *testing.T, so func (dut *DUT) GetSockOptTCPInfo(t *testing.T, sockfd int32) linux.TCPInfo { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, info, err := dut.GetSockOptTCPInfoWithErrno(ctx, t, sockfd) + ret, info, err := dut.GetSockOptTCPInfoWithErrno(context.Background(), t, sockfd) if ret != 0 || err != unix.Errno(0) { t.Fatalf("failed to GetSockOptTCPInfo: %s", err) } @@ -463,9 +445,7 @@ func (dut *DUT) GetSockOptTCPInfoWithErrno(ctx context.Context, t *testing.T, so func (dut *DUT) Listen(t *testing.T, sockfd, backlog int32) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.ListenWithErrno(ctx, t, sockfd, backlog) + ret, err := dut.ListenWithErrno(context.Background(), t, sockfd, backlog) if ret != 0 { t.Fatalf("failed to listen: %s", err) } @@ -510,13 +490,7 @@ func (dut *DUT) PollOne(t *testing.T, fd int32, events int16, timeout time.Durat func (dut *DUT) Poll(t *testing.T, pfds []unix.PollFd, timeout time.Duration) []unix.PollFd { t.Helper() - ctx := context.Background() - var cancel context.CancelFunc - if timeout >= 0 { - ctx, cancel = context.WithTimeout(ctx, timeout+RPCTimeout) - defer cancel() - } - ret, result, err := dut.PollWithErrno(ctx, t, pfds, timeout) + ret, result, err := dut.PollWithErrno(context.Background(), t, pfds, timeout) if ret < 0 { t.Fatalf("failed to poll: %s", err) } @@ -559,9 +533,7 @@ func (dut *DUT) PollWithErrno(ctx context.Context, t *testing.T, pfds []unix.Pol func (dut *DUT) Send(t *testing.T, sockfd int32, buf []byte, flags int32) int32 { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.SendWithErrno(ctx, t, sockfd, buf, flags) + ret, err := dut.SendWithErrno(context.Background(), t, sockfd, buf, flags) if ret == -1 { t.Fatalf("failed to send: %s", err) } @@ -590,9 +562,7 @@ func (dut *DUT) SendWithErrno(ctx context.Context, t *testing.T, sockfd int32, b func (dut *DUT) SendTo(t *testing.T, sockfd int32, buf []byte, flags int32, destAddr unix.Sockaddr) int32 { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.SendToWithErrno(ctx, t, sockfd, buf, flags, destAddr) + ret, err := dut.SendToWithErrno(context.Background(), t, sockfd, buf, flags, destAddr) if ret == -1 { t.Fatalf("failed to sendto: %s", err) } @@ -625,10 +595,8 @@ func (dut *DUT) SetNonBlocking(t *testing.T, fd int32, nonblocking bool) { Fd: fd, Nonblocking: nonblocking, } - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - resp, err := dut.posixServer.SetNonblocking(ctx, req) + resp, err := dut.posixServer.SetNonblocking(context.Background(), req) if err != nil { t.Fatalf("failed to call SetNonblocking: %s", err) } @@ -661,9 +629,7 @@ func (dut *DUT) setSockOpt(ctx context.Context, t *testing.T, sockfd, level, opt func (dut *DUT) SetSockOpt(t *testing.T, sockfd, level, optname int32, optval []byte) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.SetSockOptWithErrno(ctx, t, sockfd, level, optname, optval) + ret, err := dut.SetSockOptWithErrno(context.Background(), t, sockfd, level, optname, optval) if ret != 0 { t.Fatalf("failed to SetSockOpt: %s", err) } @@ -684,9 +650,7 @@ func (dut *DUT) SetSockOptWithErrno(ctx context.Context, t *testing.T, sockfd, l func (dut *DUT) SetSockOptInt(t *testing.T, sockfd, level, optname, optval int32) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.SetSockOptIntWithErrno(ctx, t, sockfd, level, optname, optval) + ret, err := dut.SetSockOptIntWithErrno(context.Background(), t, sockfd, level, optname, optval) if ret != 0 { t.Fatalf("failed to SetSockOptInt: %s", err) } @@ -705,9 +669,7 @@ func (dut *DUT) SetSockOptIntWithErrno(ctx context.Context, t *testing.T, sockfd func (dut *DUT) SetSockOptTimeval(t *testing.T, sockfd, level, optname int32, tv *unix.Timeval) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.SetSockOptTimevalWithErrno(ctx, t, sockfd, level, optname, tv) + ret, err := dut.SetSockOptTimevalWithErrno(context.Background(), t, sockfd, level, optname, tv) if ret != 0 { t.Fatalf("failed to SetSockOptTimeval: %s", err) } @@ -746,8 +708,7 @@ func (dut *DUT) SocketWithErrno(t *testing.T, domain, typ, proto int32) (int32, Type: typ, Protocol: proto, } - ctx := context.Background() - resp, err := dut.posixServer.Socket(ctx, req) + resp, err := dut.posixServer.Socket(context.Background(), req) if err != nil { t.Fatalf("failed to call Socket: %s", err) } @@ -760,9 +721,7 @@ func (dut *DUT) SocketWithErrno(t *testing.T, domain, typ, proto int32) (int32, func (dut *DUT) Recv(t *testing.T, sockfd, len, flags int32) []byte { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, buf, err := dut.RecvWithErrno(ctx, t, sockfd, len, flags) + ret, buf, err := dut.RecvWithErrno(context.Background(), t, sockfd, len, flags) if ret == -1 { t.Fatalf("failed to recv: %s", err) } @@ -805,9 +764,7 @@ func (dut *DUT) SetSockLingerOption(t *testing.T, sockfd int32, timeout time.Dur func (dut *DUT) Shutdown(t *testing.T, fd, how int32) { t.Helper() - ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout) - defer cancel() - ret, err := dut.ShutdownWithErrno(ctx, t, fd, how) + ret, err := dut.ShutdownWithErrno(context.Background(), t, fd, how) if ret != 0 { t.Fatalf("failed to shutdown(%d, %d): %s", fd, how, err) } diff --git a/test/packetimpact/testbench/dut_client.go b/test/packetimpact/testbench/dut_client.go index 0fc3d97b4..3b69b28aa 100644 --- a/test/packetimpact/testbench/dut_client.go +++ b/test/packetimpact/testbench/dut_client.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package testbench import ( diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go index caa389780..38ae9c1d7 100644 --- a/test/packetimpact/testbench/testbench.go +++ b/test/packetimpact/testbench/testbench.go @@ -31,8 +31,6 @@ var ( Native = false // RPCKeepalive is the gRPC keepalive. RPCKeepalive = 10 * time.Second - // RPCTimeout is the gRPC timeout. - RPCTimeout = 100 * time.Millisecond // dutInfosJSON is the json string that describes information about all the // duts available to use. @@ -124,7 +122,6 @@ func (n *DUTTestNet) SubnetBroadcast() net.IP { // functions. func registerFlags(fs *flag.FlagSet) { fs.BoolVar(&Native, "native", Native, "whether the test is running natively") - fs.DurationVar(&RPCTimeout, "rpc_timeout", RPCTimeout, "gRPC timeout") fs.DurationVar(&RPCKeepalive, "rpc_keepalive", RPCKeepalive, "gRPC keepalive") fs.StringVar(&dutInfosJSON, "dut_infos_json", dutInfosJSON, "json that describes the DUTs") } diff --git a/test/packetimpact/tests/generic_dgram_socket_send_recv_test.go b/test/packetimpact/tests/generic_dgram_socket_send_recv_test.go index 00e0f7690..a9ffafc74 100644 --- a/test/packetimpact/tests/generic_dgram_socket_send_recv_test.go +++ b/test/packetimpact/tests/generic_dgram_socket_send_recv_test.go @@ -48,7 +48,6 @@ func maxUDPPayloadSize(addr net.IP) int { func init() { testbench.Initialize(flag.CommandLine) - testbench.RPCTimeout = 500 * time.Millisecond } func expectedEthLayer(t *testing.T, dut testbench.DUT, socketFD int32, sendTo net.IP) testbench.Layer { @@ -437,9 +436,7 @@ func (test *icmpV6Test) Send(t *testing.T, dut testbench.DUT, bindTo, sendTo net copy(destSockaddr.Addr[:], sendTo.To16()) // Tell the DUT to send a packet out the ICMPv6 socket. - ctx, cancel := context.WithTimeout(context.Background(), testbench.RPCTimeout) - defer cancel() - gotRet, gotErrno := dut.SendToWithErrno(ctx, t, env.socketFD, bytes, 0, &destSockaddr) + gotRet, gotErrno := dut.SendToWithErrno(context.Background(), t, env.socketFD, bytes, 0, &destSockaddr) if gotErrno != wantErrno { t.Fatalf("got dut.SendToWithErrno(_, _, %d, _, _, %s) = (_, %s), want = (_, %s)", env.socketFD, sendTo, gotErrno, wantErrno) @@ -677,9 +674,7 @@ func (test *udpTest) Send(t *testing.T, dut testbench.DUT, bindTo, sendTo net.IP } // Tell the DUT to send a packet out the UDP socket. - ctx, cancel := context.WithTimeout(context.Background(), testbench.RPCTimeout) - defer cancel() - gotRet, gotErrno := dut.SendToWithErrno(ctx, t, env.socketFD, payload, 0, destSockaddr) + gotRet, gotErrno := dut.SendToWithErrno(context.Background(), t, env.socketFD, payload, 0, destSockaddr) if gotErrno != wantErrno { t.Fatalf("got dut.SendToWithErrno(_, _, %d, _, _, %s) = (_, %s), want = (_, %s)", env.socketFD, sendTo, gotErrno, wantErrno) diff --git a/test/packetimpact/tests/tcp_connect_icmp_error_test.go b/test/packetimpact/tests/tcp_connect_icmp_error_test.go index 3b4c4cd63..15d603328 100644 --- a/test/packetimpact/tests/tcp_connect_icmp_error_test.go +++ b/test/packetimpact/tests/tcp_connect_icmp_error_test.go @@ -15,9 +15,7 @@ package tcp_connect_icmp_error_test import ( - "context" "flag" - "sync" "testing" "time" @@ -66,35 +64,38 @@ func TestTCPConnectICMPError(t *testing.T) { t.Fatalf("expected SYN, %s", err) } - done := make(chan bool) - defer close(done) - var wg sync.WaitGroup - defer wg.Wait() - wg.Add(1) - var block sync.WaitGroup - block.Add(1) + // Continuously try to read the ICMP error in an attempt to trigger a race + // condition. + start := make(chan struct{}) + done := make(chan struct{}) go func() { - defer wg.Done() - _, cancel := context.WithTimeout(context.Background(), time.Second*3) - defer cancel() + defer close(done) - block.Done() + close(start) for { select { case <-done: return default: - if errno := dut.GetSockOptInt(t, clientFD, unix.SOL_SOCKET, unix.SO_ERROR); errno != 0 { - return - } } + const want = unix.EHOSTUNREACH + switch got := unix.Errno(dut.GetSockOptInt(t, clientFD, unix.SOL_SOCKET, unix.SO_ERROR)); got { + case unix.Errno(0): + continue + case want: + return + default: + t.Fatalf("got SO_ERROR = %s, want %s", got, want) + } + } }() - block.Wait() + <-start sendICMPError(t, &conn, tcp) dut.PollOne(t, clientFD, unix.POLLHUP, time.Second) + <-done conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) // The DUT should reply with RST to our ACK as the state should have diff --git a/test/packetimpact/tests/tcp_info_test.go b/test/packetimpact/tests/tcp_info_test.go index b7514e846..5410cc368 100644 --- a/test/packetimpact/tests/tcp_info_test.go +++ b/test/packetimpact/tests/tcp_info_test.go @@ -47,7 +47,7 @@ func TestTCPInfo(t *testing.T) { samplePayload := &testbench.Payload{Bytes: sampleData} dut.Send(t, acceptFD, sampleData, 0) if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { - t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + t.Fatalf("expected a packet with payload %s: %s", samplePayload, err) } conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) @@ -55,20 +55,23 @@ func TestTCPInfo(t *testing.T) { if got, want := uint32(info.State), linux.TCP_ESTABLISHED; got != want { t.Fatalf("got %d want %d", got, want) } - rtt := time.Duration(info.RTT) * time.Microsecond - rttvar := time.Duration(info.RTTVar) * time.Microsecond - rto := time.Duration(info.RTO) * time.Microsecond - if rtt == 0 || rttvar == 0 || rto == 0 { - t.Errorf("expected rtt(%v), rttvar(%v) and rto(%v) to be greater than zero", rtt, rttvar, rto) + if info.RTT == 0 { + t.Errorf("got RTT=0, want nonzero") + } + if info.RTTVar == 0 { + t.Errorf("got RTTVar=0, want nonzero") + } + if info.RTO == 0 { + t.Errorf("got RTO=0, want nonzero") } if info.ReordSeen != 0 { - t.Errorf("expected the connection to not have any reordering, got: %v want: 0", info.ReordSeen) + t.Errorf("expected the connection to not have any reordering, got: %d want: 0", info.ReordSeen) } if info.SndCwnd == 0 { t.Errorf("expected send congestion window to be greater than zero") } if info.CaState != linux.TCP_CA_Open { - t.Errorf("expected the connection to be in open state, got: %v want: %v", info.CaState, linux.TCP_CA_Open) + t.Errorf("expected the connection to be in open state, got: %d want: %d", info.CaState, linux.TCP_CA_Open) } if t.Failed() { @@ -80,20 +83,20 @@ func TestTCPInfo(t *testing.T) { seq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t))) dut.Send(t, acceptFD, sampleData, 0) if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { - t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + t.Fatalf("expected a packet with payload %s: %s", samplePayload, err) } - // Expect retransmission of the packet within 1.5*RTO. - timeout := time.Duration(float64(info.RTO)*1.5) * time.Microsecond + // Given a generous retransmission timeout. + timeout := time.Duration(info.RTO) * 2 * time.Microsecond if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: seq}, samplePayload, timeout); err != nil { - t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + t.Fatalf("expected a packet with payload %s: %s", samplePayload, err) } info = dut.GetSockOptTCPInfo(t, acceptFD) if info.CaState != linux.TCP_CA_Loss { - t.Errorf("expected the connection to be in loss recovery, got: %v want: %v", info.CaState, linux.TCP_CA_Loss) + t.Errorf("expected the connection to be in loss recovery, got: %d want: %d", info.CaState, linux.TCP_CA_Loss) } if info.SndCwnd != 1 { - t.Errorf("expected send congestion window to be 1, got: %v %v", info.SndCwnd) + t.Errorf("expected send congestion window to be 1, got: %d", info.SndCwnd) } } diff --git a/test/packetimpact/tests/tcp_linger_test.go b/test/packetimpact/tests/tcp_linger_test.go index 88942904d..46b5ca5d8 100644 --- a/test/packetimpact/tests/tcp_linger_test.go +++ b/test/packetimpact/tests/tcp_linger_test.go @@ -98,20 +98,19 @@ func TestTCPLingerNonZeroTimeout(t *testing.T) { dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn) - // Increase timeout as Close will take longer time to - // return when SO_LINGER is set with non-zero timeout. - timeout := lingerDuration + 1*time.Second - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() start := time.Now() - dut.CloseWithErrno(ctx, t, acceptFD) - end := time.Now() - diff := end.Sub(start) - - if tt.lingerOn && diff < lingerDuration { - t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration) - } else if !tt.lingerOn && diff > 1*time.Second { - t.Errorf("expected close to return within a second, but returned later") + dut.CloseWithErrno(context.Background(), t, acceptFD) + elapsed := time.Since(start) + + expectedMaximum := time.Second + if tt.lingerOn { + expectedMaximum += lingerDuration + if elapsed < lingerDuration { + t.Errorf("expected close to take at least %s, but took %s", lingerDuration, elapsed) + } + } + if elapsed >= expectedMaximum { + t.Errorf("expected close to take at most %s, but took %s", expectedMaximum, elapsed) } if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil { @@ -144,20 +143,19 @@ func TestTCPLingerSendNonZeroTimeout(t *testing.T) { sampleData := []byte("Sample Data") dut.Send(t, acceptFD, sampleData, 0) - // Increase timeout as Close will take longer time to - // return when SO_LINGER is set with non-zero timeout. - timeout := lingerDuration + 1*time.Second - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() start := time.Now() - dut.CloseWithErrno(ctx, t, acceptFD) - end := time.Now() - diff := end.Sub(start) - - if tt.lingerOn && diff < lingerDuration { - t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration) - } else if !tt.lingerOn && diff > 1*time.Second { - t.Errorf("expected close to return within a second, but returned later") + dut.CloseWithErrno(context.Background(), t, acceptFD) + elapsed := time.Since(start) + + expectedMaximum := time.Second + if tt.lingerOn { + expectedMaximum += lingerDuration + if elapsed < lingerDuration { + t.Errorf("expected close to take at least %s, but took %s", lingerDuration, elapsed) + } + } + if elapsed >= expectedMaximum { + t.Errorf("expected close to take at most %s, but took %s", expectedMaximum, elapsed) } samplePayload := &testbench.Payload{Bytes: sampleData} @@ -221,20 +219,19 @@ func TestTCPLingerShutdownSendNonZeroTimeout(t *testing.T) { dut.Shutdown(t, acceptFD, unix.SHUT_RDWR) - // Increase timeout as Close will take longer time to - // return when SO_LINGER is set with non-zero timeout. - timeout := lingerDuration + 1*time.Second - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() start := time.Now() - dut.CloseWithErrno(ctx, t, acceptFD) - end := time.Now() - diff := end.Sub(start) - - if tt.lingerOn && diff < lingerDuration { - t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration) - } else if !tt.lingerOn && diff > 1*time.Second { - t.Errorf("expected close to return within a second, but returned later") + dut.CloseWithErrno(context.Background(), t, acceptFD) + elapsed := time.Since(start) + + expectedMaximum := time.Second + if tt.lingerOn { + expectedMaximum += lingerDuration + if elapsed < lingerDuration { + t.Errorf("expected close to take at least %s, but took %s", lingerDuration, elapsed) + } + } + if elapsed >= expectedMaximum { + t.Errorf("expected close to take at most %s, but took %s", expectedMaximum, elapsed) } samplePayload := &testbench.Payload{Bytes: sampleData} @@ -259,9 +256,10 @@ func TestTCPLingerNonEstablished(t *testing.T) { // and return immediately. start := time.Now() dut.CloseWithErrno(context.Background(), t, newFD) - diff := time.Since(start) + elapsed := time.Since(start) - if diff > lingerDuration { - t.Errorf("expected close to return within %s, but returned after %s", lingerDuration, diff) + expectedMaximum := time.Second + if elapsed >= time.Second { + t.Errorf("expected close to take at most %s, but took %s", expectedMaximum, elapsed) } } diff --git a/test/packetimpact/tests/tcp_network_unreachable_test.go b/test/packetimpact/tests/tcp_network_unreachable_test.go index 60a2dbf3d..e92e6aa9b 100644 --- a/test/packetimpact/tests/tcp_network_unreachable_test.go +++ b/test/packetimpact/tests/tcp_network_unreachable_test.go @@ -41,11 +41,9 @@ func TestTCPSynSentUnreachable(t *testing.T) { defer conn.Close(t) // Bring the DUT to SYN-SENT state with a non-blocking connect. - ctx, cancel := context.WithTimeout(context.Background(), testbench.RPCTimeout) - defer cancel() sa := unix.SockaddrInet4{Port: int(port)} copy(sa.Addr[:], dut.Net.LocalIPv4) - if _, err := dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != unix.EINPROGRESS { + if _, err := dut.ConnectWithErrno(context.Background(), t, clientFD, &sa); err != unix.EINPROGRESS { t.Errorf("got connect() = %v, want EINPROGRESS", err) } @@ -86,14 +84,12 @@ func TestTCPSynSentUnreachable6(t *testing.T) { defer conn.Close(t) // Bring the DUT to SYN-SENT state with a non-blocking connect. - ctx, cancel := context.WithTimeout(context.Background(), testbench.RPCTimeout) - defer cancel() sa := unix.SockaddrInet6{ Port: int(conn.SrcPort()), ZoneId: dut.Net.RemoteDevID, } copy(sa.Addr[:], dut.Net.LocalIPv6) - if _, err := dut.ConnectWithErrno(ctx, t, clientFD, &sa); err != unix.EINPROGRESS { + if _, err := dut.ConnectWithErrno(context.Background(), t, clientFD, &sa); err != unix.EINPROGRESS { t.Errorf("got connect() = %v, want EINPROGRESS", err) } diff --git a/test/packetimpact/tests/tcp_queue_send_recv_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_send_recv_in_syn_sent_test.go index 1c8b72ebe..974c15384 100644 --- a/test/packetimpact/tests/tcp_queue_send_recv_in_syn_sent_test.go +++ b/test/packetimpact/tests/tcp_queue_send_recv_in_syn_sent_test.go @@ -20,7 +20,6 @@ import ( "encoding/hex" "errors" "flag" - "sync" "testing" "time" @@ -54,37 +53,39 @@ func TestQueueSendInSynSentHandshake(t *testing.T) { // Test blocking send. dut.SetNonBlocking(t, socket, false) - var wg sync.WaitGroup - defer wg.Wait() - wg.Add(1) - var block sync.WaitGroup - block.Add(1) + start := make(chan struct{}) + done := make(chan struct{}) go func() { - defer wg.Done() - ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) - defer cancel() + defer close(done) - block.Done() + close(start) // Issue SEND call in SYN-SENT, this should be queued for // process until the connection is established. - n, err := dut.SendWithErrno(ctx, t, socket, sampleData, 0) - if n == -1 { + if _, err := dut.SendWithErrno(context.Background(), t, socket, sampleData, 0); err != unix.Errno(0) { t.Errorf("failed to send on DUT: %s", err) - return } }() // Wait for the goroutine to be scheduled and before it // blocks on endpoint send/receive. - block.Wait() + <-start // The following sleep is used to prevent the connection // from being established before we are blocked: there is // still a small time window between we sending the RPC // request and the system actually being blocked. time.Sleep(100 * time.Millisecond) + select { + case <-done: + t.Fatal("expected send to be blocked in SYN-SENT") + default: + } + // Bring the connection to Established. conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}) + + <-done + // Expect the data from the DUT's enqueued send request. // // On Linux, this can be piggybacked with the ACK completing the @@ -126,21 +127,16 @@ func TestQueueRecvInSynSentHandshake(t *testing.T) { // Test blocking read. dut.SetNonBlocking(t, socket, false) - var wg sync.WaitGroup - defer wg.Wait() - wg.Add(1) - var block sync.WaitGroup - block.Add(1) + start := make(chan struct{}) + done := make(chan struct{}) go func() { - defer wg.Done() - ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) - defer cancel() + defer close(done) - block.Done() + close(start) // Issue RECEIVE call in SYN-SENT, this should be queued for // process until the connection is established. - n, buff, err := dut.RecvWithErrno(ctx, t, socket, int32(len(sampleData)), 0) - if n == -1 { + n, buff, err := dut.RecvWithErrno(context.Background(), t, socket, int32(len(sampleData)), 0) + if err != unix.Errno(0) { t.Errorf("failed to recv on DUT: %s", err) return } @@ -151,7 +147,8 @@ func TestQueueRecvInSynSentHandshake(t *testing.T) { // Wait for the goroutine to be scheduled and before it // blocks on endpoint send/receive. - block.Wait() + <-start + // The following sleep is used to prevent the connection // from being established before we are blocked: there is // still a small time window between we sending the RPC @@ -169,6 +166,8 @@ func TestQueueRecvInSynSentHandshake(t *testing.T) { if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}, time.Second); err != nil { t.Fatalf("expected an ACK from DUT, but got none: %s", err) } + + <-done } // TestQueueSendInSynSentRST tests send behavior when the TCP state @@ -192,20 +191,15 @@ func TestQueueSendInSynSentRST(t *testing.T) { // Test blocking send. dut.SetNonBlocking(t, socket, false) - var wg sync.WaitGroup - defer wg.Wait() - wg.Add(1) - var block sync.WaitGroup - block.Add(1) + start := make(chan struct{}) + done := make(chan struct{}) go func() { - defer wg.Done() - ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) - defer cancel() + defer close(done) - block.Done() + close(start) // Issue SEND call in SYN-SENT, this should be queued for // process until the connection is established. - n, err := dut.SendWithErrno(ctx, t, socket, sampleData, 0) + n, err := dut.SendWithErrno(context.Background(), t, socket, sampleData, 0) if err != unix.ECONNREFUSED { t.Errorf("expected error %s, got %s", unix.ECONNREFUSED, err) } @@ -216,14 +210,23 @@ func TestQueueSendInSynSentRST(t *testing.T) { // Wait for the goroutine to be scheduled and before it // blocks on endpoint send/receive. - block.Wait() + <-start + // The following sleep is used to prevent the connection // from being established before we are blocked: there is // still a small time window between we sending the RPC // request and the system actually being blocked. time.Sleep(100 * time.Millisecond) + select { + case <-done: + t.Fatal("expected send to be blocked in SYN-SENT") + default: + } + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagRst | header.TCPFlagAck)}) + + <-done } // TestQueueRecvInSynSentRST tests recv behavior when the TCP state @@ -251,20 +254,15 @@ func TestQueueRecvInSynSentRST(t *testing.T) { // Test blocking read. dut.SetNonBlocking(t, socket, false) - var wg sync.WaitGroup - defer wg.Wait() - wg.Add(1) - var block sync.WaitGroup - block.Add(1) + start := make(chan struct{}) + done := make(chan struct{}) go func() { - defer wg.Done() - ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) - defer cancel() + defer close(done) - block.Done() + close(start) // Issue RECEIVE call in SYN-SENT, this should be queued for // process until the connection is established. - n, _, err := dut.RecvWithErrno(ctx, t, socket, int32(len(sampleData)), 0) + n, _, err := dut.RecvWithErrno(context.Background(), t, socket, int32(len(sampleData)), 0) if err != unix.ECONNREFUSED { t.Errorf("expected error %s, got %s", unix.ECONNREFUSED, err) } @@ -275,7 +273,8 @@ func TestQueueRecvInSynSentRST(t *testing.T) { // Wait for the goroutine to be scheduled and before it // blocks on endpoint send/receive. - block.Wait() + <-start + // The following sleep is used to prevent the connection // from being established before we are blocked: there is // still a small time window between we sending the RPC @@ -283,4 +282,5 @@ func TestQueueRecvInSynSentRST(t *testing.T) { time.Sleep(100 * time.Millisecond) conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagRst | header.TCPFlagAck)}) + <-done } diff --git a/test/packetimpact/tests/udp_icmp_error_propagation_test.go b/test/packetimpact/tests/udp_icmp_error_propagation_test.go index 087aeb66e..556cee1d9 100644 --- a/test/packetimpact/tests/udp_icmp_error_propagation_test.go +++ b/test/packetimpact/tests/udp_icmp_error_propagation_test.go @@ -141,8 +141,6 @@ func testRecv(ctx context.Context, t *testing.T, d testData) { d.conn.Send(t, testbench.UDP{}) if d.wantErrno != unix.Errno(0) { - ctx, cancel := context.WithTimeout(ctx, time.Second) - defer cancel() ret, _, err := d.dut.RecvWithErrno(ctx, t, d.remoteFD, 100, 0) if ret != -1 { t.Fatalf("recv after ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", d.wantErrno) @@ -157,7 +155,7 @@ func testRecv(ctx context.Context, t *testing.T, d testData) { // testSendTo tests observing the ICMP error through the send syscall. If // wantErrno is non-zero, the first send should fail and a subsequent send -// should suceed; while if wantErrno is zero then the first send should just +// should succeed; while if wantErrno is zero then the first send should just // succeed. func testSendTo(ctx context.Context, t *testing.T, d testData) { // Check that sending on the clean socket works. @@ -167,8 +165,6 @@ func testSendTo(ctx context.Context, t *testing.T, d testData) { } if d.wantErrno != unix.Errno(0) { - ctx, cancel := context.WithTimeout(ctx, time.Second) - defer cancel() ret, err := d.dut.SendToWithErrno(ctx, t, d.remoteFD, nil, 0, d.conn.LocalAddr(t)) if ret != -1 { @@ -315,10 +311,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { defer wg.Done() if wantErrno != unix.Errno(0) { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - ret, _, err := dut.RecvWithErrno(ctx, t, remoteFD, 100, 0) + ret, _, err := dut.RecvWithErrno(context.Background(), t, remoteFD, 100, 0) if ret != -1 { t.Errorf("recv during ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", wantErrno) return @@ -329,10 +322,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { } } - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if ret, _, err := dut.RecvWithErrno(ctx, t, remoteFD, 100, 0); ret == -1 { + if ret, _, err := dut.RecvWithErrno(context.Background(), t, remoteFD, 100, 0); ret == -1 { t.Errorf("recv after ICMP error failed with (%[1]d) %[1]", err) } }() @@ -340,10 +330,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) { go func() { defer wg.Done() - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if ret, _, err := dut.RecvWithErrno(ctx, t, cleanFD, 100, 0); ret == -1 { + if ret, _, err := dut.RecvWithErrno(context.Background(), t, cleanFD, 100, 0); ret == -1 { t.Errorf("recv on clean socket failed with (%[1]d) %[1]", err) } }() diff --git a/test/perf/BUILD b/test/perf/BUILD index 75b5003e2..59923da47 100644 --- a/test/perf/BUILD +++ b/test/perf/BUILD @@ -70,6 +70,13 @@ syscall_test( ) syscall_test( + size = "large", + add_overlay = True, + debug = False, + test = "//test/perf/linux:dup_benchmark", +) + +syscall_test( debug = False, test = "//test/perf/linux:pipe_benchmark", ) @@ -139,3 +146,38 @@ syscall_test( debug = False, test = "//test/perf/linux:write_benchmark", ) + +syscall_test( + size = "large", + debug = False, + test = "//test/perf/linux:verity_open_benchmark", + vfs1 = False, +) + +syscall_test( + size = "large", + debug = False, + test = "//test/perf/linux:verity_read_benchmark", + vfs1 = False, +) + +syscall_test( + size = "large", + debug = False, + test = "//test/perf/linux:verity_randread_benchmark", + vfs1 = False, +) + +syscall_test( + size = "large", + debug = False, + test = "//test/perf/linux:verity_open_read_close_benchmark", + vfs1 = False, +) + +syscall_test( + size = "large", + debug = False, + test = "//test/perf/linux:verity_stat_benchmark", + vfs1 = False, +) diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD index dd1d2438c..020a69ab5 100644 --- a/test/perf/linux/BUILD +++ b/test/perf/linux/BUILD @@ -27,10 +27,10 @@ cc_binary( deps = [ gbenchmark, gtest, - "//test/syscalls/linux:socket_test_util", "//test/util:file_descriptor", "//test/util:logging", "//test/util:posix_error", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", "//test/util:thread_util", @@ -109,6 +109,22 @@ cc_binary( ) cc_binary( + name = "dup_benchmark", + testonly = 1, + srcs = [ + "dup_benchmark.cc", + ], + deps = [ + gbenchmark, + gtest, + "//test/util:fs_util", + "//test/util:logging", + "//test/util:temp_path", + "//test/util:test_main", + ], +) + +cc_binary( name = "read_benchmark", testonly = 1, srcs = [ @@ -370,3 +386,99 @@ cc_binary( "//test/util:test_main", ], ) + +cc_binary( + name = "verity_open_benchmark", + testonly = 1, + srcs = [ + "verity_open_benchmark.cc", + ], + deps = [ + gbenchmark, + gtest, + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:logging", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:verity_util", + ], +) + +cc_binary( + name = "verity_read_benchmark", + testonly = 1, + srcs = [ + "verity_read_benchmark.cc", + ], + deps = [ + gbenchmark, + gtest, + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:logging", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:verity_util", + ], +) + +cc_binary( + name = "verity_randread_benchmark", + testonly = 1, + srcs = [ + "verity_randread_benchmark.cc", + ], + deps = [ + gbenchmark, + gtest, + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:logging", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:verity_util", + ], +) + +cc_binary( + name = "verity_open_read_close_benchmark", + testonly = 1, + srcs = [ + "verity_open_read_close_benchmark.cc", + ], + deps = [ + gbenchmark, + gtest, + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:logging", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:verity_util", + ], +) + +cc_binary( + name = "verity_stat_benchmark", + testonly = 1, + srcs = [ + "verity_stat_benchmark.cc", + ], + deps = [ + gbenchmark, + gtest, + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:logging", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:verity_util", + "@com_google_absl//absl/strings", + ], +) diff --git a/test/perf/linux/dup_benchmark.cc b/test/perf/linux/dup_benchmark.cc new file mode 100644 index 000000000..5d808d225 --- /dev/null +++ b/test/perf/linux/dup_benchmark.cc @@ -0,0 +1,55 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" +#include "benchmark/benchmark.h" +#include "test/util/fs_util.h" +#include "test/util/logging.h" +#include "test/util/temp_path.h" + +namespace gvisor { + +namespace { + +void BM_Dup(benchmark::State& state) { + const int size = state.range(0); + + for (auto _ : state) { + std::vector<int> v; + for (int i = 0; i < size; i++) { + int fd = dup(2); + TEST_CHECK(fd != -1); + v.push_back(fd); + } + for (int i = 0; i < size; i++) { + int fd = v[i]; + close(fd); + } + } + state.SetItemsProcessed(state.iterations() * size); +} + +BENCHMARK(BM_Dup)->Range(1, 1 << 15)->UseRealTime(); + +} // namespace + +} // namespace gvisor diff --git a/test/perf/linux/randread_benchmark.cc b/test/perf/linux/randread_benchmark.cc index b0eb8c24e..11b56a8cb 100644 --- a/test/perf/linux/randread_benchmark.cc +++ b/test/perf/linux/randread_benchmark.cc @@ -85,7 +85,7 @@ void BM_RandRead(benchmark::State& state) { unsigned int seed = 1; for (auto _ : state) { TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(), - rand_r(&seed) % kFileSize) == size); + rand_r(&seed) % (kFileSize - buf.size())) == size); } state.SetBytesProcessed(static_cast<int64_t>(size) * diff --git a/test/perf/linux/send_recv_benchmark.cc b/test/perf/linux/send_recv_benchmark.cc index d73e49523..2d443f54f 100644 --- a/test/perf/linux/send_recv_benchmark.cc +++ b/test/perf/linux/send_recv_benchmark.cc @@ -23,10 +23,10 @@ #include "gtest/gtest.h" #include "absl/synchronization/notification.h" #include "benchmark/benchmark.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/logging.h" #include "test/util/posix_error.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -80,6 +80,9 @@ void BM_Recvmsg(benchmark::State& state) { int64_t bytes_received = 0; for (auto ignored : state) { int n = recvmsg(recv_socket.get(), recv_msg.header(), 0); + if (n == -1 && errno == EINTR) { + continue; + } TEST_CHECK(n > 0); bytes_received += n; } @@ -108,6 +111,9 @@ void BM_Sendmsg(benchmark::State& state) { int64_t bytes_sent = 0; for (auto ignored : state) { int n = sendmsg(send_socket.get(), send_msg.header(), 0); + if (n == -1 && errno == EINTR) { + continue; + } TEST_CHECK(n > 0); bytes_sent += n; } @@ -137,6 +143,9 @@ void BM_Recvfrom(benchmark::State& state) { for (auto ignored : state) { int n = recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr, nullptr); + if (n == -1 && errno == EINTR) { + continue; + } TEST_CHECK(n > 0); bytes_received += n; } @@ -166,6 +175,9 @@ void BM_Sendto(benchmark::State& state) { int64_t bytes_sent = 0; for (auto ignored : state) { int n = sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0); + if (n == -1 && errno == EINTR) { + continue; + } TEST_CHECK(n > 0); bytes_sent += n; } @@ -247,6 +259,9 @@ void BM_RecvmsgWithControlBuf(benchmark::State& state) { int64_t bytes_received = 0; for (auto ignored : state) { int n = recvmsg(recv_socket.get(), recv_msg.header(), 0); + if (n == -1 && errno == EINTR) { + continue; + } TEST_CHECK(n > 0); bytes_received += n; } @@ -316,7 +331,11 @@ void BM_SendmsgTCP(benchmark::State& state) { ScopedThread t([&recv_msg, &recv_socket, ¬ification] { while (!notification.HasBeenNotified()) { - TEST_CHECK(recvmsg(recv_socket.get(), recv_msg.header(), 0) >= 0); + int rc = recvmsg(recv_socket.get(), recv_msg.header(), 0); + if (rc == -1 && errno == EINTR) { + continue; + } + TEST_CHECK(rc >= 0); } }); diff --git a/test/perf/linux/verity_open_benchmark.cc b/test/perf/linux/verity_open_benchmark.cc new file mode 100644 index 000000000..026b6f101 --- /dev/null +++ b/test/perf/linux/verity_open_benchmark.cc @@ -0,0 +1,71 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fcntl.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <unistd.h> + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" +#include "benchmark/benchmark.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/logging.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/verity_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +void BM_Open(benchmark::State& state) { + const int size = state.range(0); + std::vector<TempPath> cache; + std::vector<EnableTarget> targets; + + // Mount a tmpfs file system to be wrapped by a verity fs. + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TEST_CHECK(mount("", dir.path().c_str(), "tmpfs", 0, "") == 0); + + for (int i = 0; i < size; i++) { + auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + targets.emplace_back( + EnableTarget(std::string(Basename(path.path())), O_RDONLY)); + cache.emplace_back(std::move(path)); + } + + std::string verity_dir = + TEST_CHECK_NO_ERRNO_AND_VALUE(MountVerity(dir.path(), targets)); + + unsigned int seed = 1; + for (auto _ : state) { + const int chosen = rand_r(&seed) % size; + int fd = open(JoinPath(verity_dir, targets[chosen].path).c_str(), O_RDONLY); + TEST_CHECK(fd != -1); + close(fd); + } +} + +BENCHMARK(BM_Open)->Range(1, 128)->UseRealTime(); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/perf/linux/verity_open_read_close_benchmark.cc b/test/perf/linux/verity_open_read_close_benchmark.cc new file mode 100644 index 000000000..e77577f22 --- /dev/null +++ b/test/perf/linux/verity_open_read_close_benchmark.cc @@ -0,0 +1,75 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fcntl.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <unistd.h> + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" +#include "benchmark/benchmark.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/logging.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/verity_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +void BM_VerityOpenReadClose(benchmark::State& state) { + const int size = state.range(0); + + // Mount a tmpfs file system to be wrapped by a verity fs. + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TEST_CHECK(mount("", dir.path().c_str(), "tmpfs", 0, "") == 0); + + std::vector<TempPath> cache; + std::vector<EnableTarget> targets; + + for (int i = 0; i < size; i++) { + auto file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), "some contents", 0644)); + targets.emplace_back( + EnableTarget(std::string(Basename(file.path())), O_RDONLY)); + cache.emplace_back(std::move(file)); + } + + std::string verity_dir = + TEST_CHECK_NO_ERRNO_AND_VALUE(MountVerity(dir.path(), targets)); + + char buf[1]; + unsigned int seed = 1; + for (auto _ : state) { + const int chosen = rand_r(&seed) % size; + int fd = open(JoinPath(verity_dir, targets[chosen].path).c_str(), O_RDONLY); + TEST_CHECK(fd != -1); + TEST_CHECK(read(fd, buf, 1) == 1); + close(fd); + } +} + +BENCHMARK(BM_VerityOpenReadClose)->Range(1000, 16384)->UseRealTime(); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/perf/linux/verity_randread_benchmark.cc b/test/perf/linux/verity_randread_benchmark.cc new file mode 100644 index 000000000..4178cfad8 --- /dev/null +++ b/test/perf/linux/verity_randread_benchmark.cc @@ -0,0 +1,108 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fcntl.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <unistd.h> + +#include "gtest/gtest.h" +#include "benchmark/benchmark.h" +#include "test/util/logging.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/verity_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Create a 1GB file that will be read from at random positions. This should +// invalid any performance gains from caching. +const uint64_t kFileSize = Megabytes(1024); + +// How many bytes to write at once to initialize the file used to read from. +const uint32_t kWriteSize = 65536; + +// Largest benchmarked read unit. +const uint32_t kMaxRead = Megabytes(64); + +// Global test state, initialized once per process lifetime. +struct GlobalState { + explicit GlobalState() { + // Mount a tmpfs file system to be wrapped by a verity fs. + tmp_dir_ = TempPath::CreateDir().ValueOrDie(); + TEST_CHECK(mount("", tmp_dir_.path().c_str(), "tmpfs", 0, "") == 0); + file_ = TempPath::CreateFileIn(tmp_dir_.path()).ValueOrDie(); + filename_ = std::string(Basename(file_.path())); + + FileDescriptor fd = Open(file_.path(), O_WRONLY).ValueOrDie(); + + // Try to minimize syscalls by using maximum size writev() requests. + std::vector<char> buffer(kWriteSize); + RandomizeBuffer(buffer.data(), buffer.size()); + const std::vector<std::vector<struct iovec>> iovecs_list = + GenerateIovecs(kFileSize + kMaxRead, buffer.data(), buffer.size()); + for (const auto& iovecs : iovecs_list) { + TEST_CHECK(writev(fd.get(), iovecs.data(), iovecs.size()) >= 0); + } + verity_dir_ = + MountVerity(tmp_dir_.path(), {EnableTarget(filename_, O_RDONLY)}) + .ValueOrDie(); + } + TempPath tmp_dir_; + TempPath file_; + std::string verity_dir_; + std::string filename_; +}; + +GlobalState& GetGlobalState() { + // This gets created only once throughout the lifetime of the process. + // Use a dynamically allocated object (that is never deleted) to avoid order + // of destruction of static storage variables issues. + static GlobalState* const state = + // The actual file size is the maximum random seek range (kFileSize) + the + // maximum read size so we can read that number of bytes at the end of the + // file. + new GlobalState(); + return *state; +} + +void BM_VerityRandRead(benchmark::State& state) { + const int size = state.range(0); + + GlobalState& global_state = GetGlobalState(); + FileDescriptor verity_fd = ASSERT_NO_ERRNO_AND_VALUE(Open( + JoinPath(global_state.verity_dir_, global_state.filename_), O_RDONLY)); + std::vector<char> buf(size); + + unsigned int seed = 1; + for (auto _ : state) { + TEST_CHECK(PreadFd(verity_fd.get(), buf.data(), buf.size(), + rand_r(&seed) % kFileSize) == size); + } + + state.SetBytesProcessed(static_cast<int64_t>(size) * + static_cast<int64_t>(state.iterations())); +} + +BENCHMARK(BM_VerityRandRead)->Range(1, kMaxRead)->UseRealTime(); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/perf/linux/verity_read_benchmark.cc b/test/perf/linux/verity_read_benchmark.cc new file mode 100644 index 000000000..738b5ba45 --- /dev/null +++ b/test/perf/linux/verity_read_benchmark.cc @@ -0,0 +1,69 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fcntl.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <unistd.h> + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" +#include "benchmark/benchmark.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/logging.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/verity_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +void BM_VerityRead(benchmark::State& state) { + const int size = state.range(0); + const std::string contents(size, 0); + + // Mount a tmpfs file system to be wrapped by a verity fs. + TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TEST_CHECK(mount("", dir.path().c_str(), "tmpfs", 0, "") == 0); + + auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + dir.path(), contents, TempPath::kDefaultFileMode)); + std::string filename = std::string(Basename(path.path())); + + std::string verity_dir = TEST_CHECK_NO_ERRNO_AND_VALUE( + MountVerity(dir.path(), {EnableTarget(filename, O_RDONLY)})); + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(JoinPath(verity_dir, filename), O_RDONLY)); + std::vector<char> buf(size); + for (auto _ : state) { + TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(), 0) == size); + } + + state.SetBytesProcessed(static_cast<int64_t>(size) * + static_cast<int64_t>(state.iterations())); +} + +BENCHMARK(BM_VerityRead)->Range(1, 1 << 26)->UseRealTime(); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/perf/linux/verity_stat_benchmark.cc b/test/perf/linux/verity_stat_benchmark.cc new file mode 100644 index 000000000..b43a9e266 --- /dev/null +++ b/test/perf/linux/verity_stat_benchmark.cc @@ -0,0 +1,84 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <algorithm> +#include <vector> + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "benchmark/benchmark.h" +#include "test/util/fs_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/verity_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Creates a file in a nested directory hierarchy at least `depth` directories +// deep, and stats that file multiple times. +void BM_VerityStat(benchmark::State& state) { + // Create nested directories with given depth. + int depth = state.range(0); + + // Mount a tmpfs file system to be wrapped by a verity fs. + TempPath top_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TEST_CHECK(mount("", top_dir.path().c_str(), "tmpfs", 0, "") == 0); + std::string dir_path = top_dir.path(); + std::string child_path = ""; + std::vector<EnableTarget> targets; + + while (depth-- > 0) { + // Don't use TempPath because it will make paths too long to use. + // + // The top_dir destructor will clean up this whole tree. + dir_path = JoinPath(dir_path, absl::StrCat(depth)); + ASSERT_NO_ERRNO(Mkdir(dir_path, 0755)); + child_path = JoinPath(child_path, Basename(dir_path)); + targets.emplace_back(EnableTarget(child_path, O_RDONLY)); + } + + // Create the file that will be stat'd. + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir_path)); + + targets.emplace_back( + EnableTarget(JoinPath(child_path, Basename(file.path())), O_RDONLY)); + + // Reverse the targets because verity should be enabled from the lowest level. + std::reverse(targets.begin(), targets.end()); + + std::string verity_dir = + TEST_CHECK_NO_ERRNO_AND_VALUE(MountVerity(top_dir.path(), targets)); + + struct stat st; + for (auto _ : state) { + ASSERT_THAT(stat(JoinPath(verity_dir, targets[0].path).c_str(), &st), + SyscallSucceeds()); + } +} + +BENCHMARK(BM_VerityStat)->Range(1, 100)->UseRealTime(); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/root/chroot_test.go b/test/root/chroot_test.go index 58fcd6f08..5114a9602 100644 --- a/test/root/chroot_test.go +++ b/test/root/chroot_test.go @@ -68,13 +68,15 @@ func TestChroot(t *testing.T) { if err != nil { t.Fatalf("error listing %q: %v", chroot, err) } - if want, got := 1, len(fi); want != got { + if want, got := 2, len(fi); want != got { t.Fatalf("chroot dir got %d entries, want %d", got, want) } - // chroot dir is prepared by runsc and should contains only /proc. - if fi[0].Name() != "proc" { - t.Errorf("chroot got children %v, want %v", fi[0].Name(), "proc") + // chroot dir is prepared by runsc and should contains only /etc and /proc. + for i, want := range []string{"etc", "proc"} { + if got := fi[i].Name(); got != want { + t.Errorf("chroot got child %v, want %v", got, want) + } } d.CleanUp(ctx) diff --git a/test/runner/BUILD b/test/runner/BUILD index f9f788726..2d93aa6af 100644 --- a/test/runner/BUILD +++ b/test/runner/BUILD @@ -8,6 +8,7 @@ go_binary( srcs = ["main.go"], data = [ "//runsc", + "//test/runner/setup_container", ], visibility = ["//:sandbox"], deps = [ diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl index 416f51935..05c75b130 100644 --- a/test/runner/defs.bzl +++ b/test/runner/defs.bzl @@ -103,6 +103,8 @@ def _syscall_test( if platform == "native": tags.append("nogotsan") + container = "container" in tags + runner_args = [ # Arguments are passed directly to runner binary. "--platform=" + platform, @@ -115,6 +117,7 @@ def _syscall_test( "--fuse=" + str(fuse), "--strace=" + str(debug), "--debug=" + str(debug), + "--container=" + str(container), ] # Call the rule above. @@ -132,6 +135,7 @@ def syscall_test( add_overlay = False, add_uds_tree = False, add_hostinet = False, + vfs1 = True, vfs2 = True, fuse = False, debug = True, @@ -145,6 +149,7 @@ def syscall_test( add_overlay: add an overlay test. add_uds_tree: add a UDS test. add_hostinet: add a hostinet test. + vfs1: enable VFS1 tests. Could be false only if vfs2 is true. vfs2: enable VFS2 support. fuse: enable FUSE support. debug: enable debug output. @@ -154,7 +159,7 @@ def syscall_test( if not tags: tags = [] - if vfs2 and not fuse: + if vfs2 and vfs1 and not fuse: # Generate a vfs1 plain test. Most testing will now be # biased towards vfs2, with only a single vfs1 case. _syscall_test( @@ -168,7 +173,7 @@ def syscall_test( **kwargs ) - if not fuse: + if vfs1 and not fuse: # Generate a native test if fuse is not required. _syscall_test( test = test, diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go index 38e57d62f..affbf1973 100644 --- a/test/runner/gtest/gtest.go +++ b/test/runner/gtest/gtest.go @@ -133,17 +133,28 @@ func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]Tes } // Run again to extract benchmarks. - args = append([]string{listBenchmarkFlag}, extraArgs...) - cmd = exec.Command(testBin, args...) - out, err = cmd.Output() + tb, err := ParseBenchmarks(testBin, extraArgs...) + if err != nil { + return nil, err + } + t = append(t, tb...) + return t, nil +} + +// ParseBenchmarks returns each benchmark in a third_party/benchmark binary's list as a single test case. +func ParseBenchmarks(binary string, extraArgs ...string) ([]TestCase, error) { + var t []TestCase + args := append([]string{listBenchmarkFlag}, extraArgs...) + cmd := exec.Command(binary, args...) + out, err := cmd.Output() if err != nil { // We were able to enumerate tests above, but not benchmarks? // We requested them, so we return an error in this case. exitErr, ok := err.(*exec.ExitError) if !ok { - return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v", err) + return nil, fmt.Errorf("could not enumerate benchmarks: %v", err) } - return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr) + return nil, fmt.Errorf("could not enumerate benchmarks: %v\nstderr\n%s", err, exitErr.Stderr) } benches := strings.Trim(string(out), "\n") diff --git a/test/runner/main.go b/test/runner/main.go index 7e8e88ba2..2cab8d2d4 100644 --- a/test/runner/main.go +++ b/test/runner/main.go @@ -40,16 +40,18 @@ import ( ) var ( - debug = flag.Bool("debug", false, "enable debug logs") - strace = flag.Bool("strace", false, "enable strace logs") - platform = flag.String("platform", "ptrace", "platform to run on") - network = flag.String("network", "none", "network stack to run on (sandbox, host, none)") - useTmpfs = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp") - fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay") - vfs2 = flag.Bool("vfs2", false, "enable VFS2") - fuse = flag.Bool("fuse", false, "enable FUSE") - runscPath = flag.String("runsc", "", "path to runsc binary") + debug = flag.Bool("debug", false, "enable debug logs") + strace = flag.Bool("strace", false, "enable strace logs") + platform = flag.String("platform", "ptrace", "platform to run on") + network = flag.String("network", "none", "network stack to run on (sandbox, host, none)") + useTmpfs = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp") + fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay") + vfs2 = flag.Bool("vfs2", false, "enable VFS2") + fuse = flag.Bool("fuse", false, "enable FUSE") + container = flag.Bool("container", false, "run tests in their own namespaces (user ns, network ns, etc), pretending to be root") + setupContainerPath = flag.String("setup-container", "", "path to setup_container binary (for use with --container)") + runscPath = flag.String("runsc", "", "path to runsc binary") addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests") // TODO(gvisor.dev/issue/4572): properly support leak checking for runsc, and @@ -105,6 +107,27 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) { cmd.Stderr = os.Stderr cmd.SysProcAttr = &unix.SysProcAttr{} + if *container { + // setup_container takes in its target argv as positional arguments. + cmd.Path = *setupContainerPath + cmd.Args = append([]string{cmd.Path}, cmd.Args...) + cmd.SysProcAttr = &unix.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER | unix.CLONE_NEWNET | unix.CLONE_NEWIPC | unix.CLONE_NEWUTS, + // Set current user/group as root inside the namespace. + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + GidMappingsEnableSetgroups: false, + Credential: &syscall.Credential{ + Uid: 0, + Gid: 0, + }, + } + } + if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) { cmd.SysProcAttr.Cloneflags |= unix.CLONE_NEWUTS } @@ -147,6 +170,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error { "-network", *network, "-log-format=text", "-TESTONLY-unsafe-nonroot=true", + "-TESTONLY-allow-packet-endpoint-write=true", "-net-raw=true", fmt.Sprintf("-panic-signal=%d", unix.SIGTERM), "-watchdog-action=panic", @@ -454,6 +478,13 @@ func main() { } *runscPath = specutils.ExePath } + if *container && *setupContainerPath == "" { + setupContainer, err := testutil.FindFile("test/runner/setup_container/setup_container") + if err != nil { + fatalf("cannot find setup_container: %v", err) + } + *setupContainerPath = setupContainer + } // Make sure stdout and stderr are opened with O_APPEND, otherwise logs // from outside the sandbox can (and will) stomp on logs from inside diff --git a/test/runner/setup_container/BUILD b/test/runner/setup_container/BUILD new file mode 100644 index 000000000..f879ef649 --- /dev/null +++ b/test/runner/setup_container/BUILD @@ -0,0 +1,19 @@ +# setup_container contains a shim binary that runs within the test container +# for syscall tests with container=True. + +load("//tools:defs.bzl", "cc_binary") + +package(licenses = ["notice"]) + +cc_binary( + name = "setup_container", + testonly = 1, + srcs = ["setup_container.cc"], + visibility = ["//test/runner:__subpackages__"], + deps = [ + "//test/syscalls/linux:socket_netlink_util", + "//test/util:capability_util", + "//test/util:posix_error", + "//test/util:socket_util", + ], +) diff --git a/test/runner/setup_container/setup_container.cc b/test/runner/setup_container/setup_container.cc new file mode 100644 index 000000000..e6fae37e1 --- /dev/null +++ b/test/runner/setup_container/setup_container.cc @@ -0,0 +1,79 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <linux/capability.h> +#include <sys/ioctl.h> +#include <unistd.h> + +#include "test/syscalls/linux/socket_netlink_util.h" +#include "test/util/capability_util.h" +#include "test/util/posix_error.h" +#include "test/util/socket_util.h" + +namespace gvisor { +namespace testing { + +// SetupContainer sets up the networking settings in the current container. +PosixError SetupContainer() { + const PosixErrorOr<bool> have_net_admin = HaveCapability(CAP_NET_ADMIN); + if (!have_net_admin.ok()) { + std::cerr << "Cannot determine if we have CAP_NET_ADMIN." << std::endl; + return have_net_admin.error(); + } + if (have_net_admin.ValueOrDie() && !IsRunningOnGvisor()) { + PosixErrorOr<FileDescriptor> sockfd = Socket(AF_INET, SOCK_DGRAM, 0); + if (!sockfd.ok()) { + std::cerr << "Cannot open socket." << std::endl; + return sockfd.error(); + } + int sock = sockfd.ValueOrDie().get(); + struct ifreq ifr = {}; + strncpy(ifr.ifr_name, "lo", IFNAMSIZ); + if (ioctl(sock, SIOCGIFFLAGS, &ifr) == -1) { + std::cerr << "Cannot get 'lo' flags: " << strerror(errno) << std::endl; + return PosixError(errno); + } + if ((ifr.ifr_flags & IFF_UP) == 0) { + ifr.ifr_flags |= IFF_UP; + if (ioctl(sock, SIOCSIFFLAGS, &ifr) == -1) { + std::cerr << "Cannot set 'lo' as UP: " << strerror(errno) << std::endl; + return PosixError(errno); + } + } + } + return NoError(); +} + +} // namespace testing +} // namespace gvisor + +using ::gvisor::testing::SetupContainer; + +// Binary setup_container initializes the container environment in which tests +// with container=True will run, then execs the actual test binary. +// Usage: +// ./setup_container test_binary [arguments forwarded to test_binary...] +int main(int argc, char *argv[], char *envp[]) { + if (!SetupContainer().ok()) { + return 1; + } + if (argc < 2) { + std::cerr << "Must provide arguments to exec." << std::endl; + return 2; + } + if (execve(argv[1], &argv[1], envp) == -1) { + std::cerr << "execv returned errno " << errno << std::endl; + return 1; + } +} diff --git a/test/runtimes/runner/lib/lib.go b/test/runtimes/runner/lib/lib.go index f2db5f9ea..d6b652897 100644 --- a/test/runtimes/runner/lib/lib.go +++ b/test/runtimes/runner/lib/lib.go @@ -152,7 +152,7 @@ func getTests(ctx context.Context, d *dockerutil.Container, lang, image string, return itests, nil } -// getBlacklist reads the exclude file and returns a set of test names to +// getExcludes reads the exclude file and returns a set of test names to // exclude. func getExcludes(excludeFile string) (map[string]struct{}, error) { excludes := make(map[string]struct{}) diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 99743b14a..8494862fa 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -331,6 +331,10 @@ syscall_test( ) syscall_test( + test = "//test/syscalls/linux:msgqueue_test", +) + +syscall_test( size = "medium", test = "//test/syscalls/linux:msync_test", ) @@ -355,6 +359,10 @@ syscall_test( ) syscall_test( + test = "//test/syscalls/linux:packet_socket_dgram_test", +) + +syscall_test( test = "//test/syscalls/linux:packet_socket_raw_test", ) @@ -648,6 +656,13 @@ syscall_test( syscall_test( size = "large", shard_count = most_shards, + tags = ["container"], + test = "//test/syscalls/linux:socket_inet_loopback_isolated_test", +) + +syscall_test( + size = "large", + shard_count = most_shards, # Takes too long for TSAN. Creates a lot of TCP sockets. tags = ["nogotsan"], test = "//test/syscalls/linux:socket_inet_loopback_nogotsan_test", @@ -696,6 +711,11 @@ syscall_test( syscall_test( size = "medium", + test = "//test/syscalls/linux:socket_ipv4_datagram_based_socket_unbound_loopback_test", +) + +syscall_test( + size = "medium", test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_test", ) @@ -731,6 +751,7 @@ syscall_test( ) syscall_test( + add_hostinet = True, test = "//test/syscalls/linux:socket_netdevice_test", ) @@ -876,6 +897,10 @@ syscall_test( ) syscall_test( + test = "//test/syscalls/linux:verity_symlink_test", +) + +syscall_test( add_overlay = True, test = "//test/syscalls/linux:sync_test", ) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index d8b562e9d..5efb3e620 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "gbenchmark", "gtest", "select_arch", "select_system") +load("//tools:defs.bzl", "cc_binary", "cc_library", "gbenchmark", "gtest", "select_arch", "select_system") package( default_visibility = ["//:sandbox"], @@ -7,8 +7,16 @@ package( exports_files( [ + "packet_socket.cc", + "packet_socket_dgram.cc", + "packet_socket_raw.cc", + "raw_socket.cc", + "raw_socket_hdrincl.cc", + "raw_socket_icmp.cc", "socket.cc", "socket_inet_loopback.cc", + "socket_inet_loopback_isolated.cc", + "socket_inet_loopback_test_params.h", "socket_ip_loopback_blocking.cc", "socket_ip_tcp_generic_loopback.cc", "socket_ip_tcp_loopback.cc", @@ -19,6 +27,7 @@ exports_files( "socket_ip_udp_loopback_blocking.cc", "socket_ip_udp_loopback_nonblock.cc", "socket_ip_unbound.cc", + "socket_ipv4_datagram_based_socket_unbound_loopback.cc", "socket_ipv4_udp_unbound_external_networking_test.cc", "socket_ipv6_udp_unbound_external_networking_test.cc", "socket_ipv4_udp_unbound_loopback.cc", @@ -126,9 +135,9 @@ cc_library( srcs = ["socket_netlink_util.cc"], hdrs = ["socket_netlink_util.h"], deps = [ - ":socket_test_util", "//test/util:file_descriptor", "//test/util:posix_error", + "//test/util:socket_util", "@com_google_absl//absl/strings", ], ) @@ -144,36 +153,12 @@ cc_library( ) cc_library( - name = "socket_test_util", - testonly = 1, - srcs = [ - "socket_test_util.cc", - "socket_test_util_impl.cc", - ], - hdrs = ["socket_test_util.h"], - defines = select_system(), - deps = default_net_util() + [ - gtest, - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/time", - "@com_google_absl//absl/types:optional", - "//test/util:file_descriptor", - "//test/util:posix_error", - "//test/util:temp_path", - "//test/util:test_util", - "//test/util:thread_util", - ], -) - -cc_library( name = "unix_domain_socket_test_util", testonly = 1, srcs = ["unix_domain_socket_test_util.cc"], hdrs = ["unix_domain_socket_test_util.h"], deps = [ - ":socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/strings", gtest, "//test/util:test_util", @@ -186,7 +171,7 @@ cc_library( srcs = ["ip_socket_test_util.cc"], hdrs = ["ip_socket_test_util.h"], deps = [ - ":socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/strings", ], ) @@ -231,9 +216,9 @@ cc_binary( srcs = ["accept_bind.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -246,9 +231,9 @@ cc_binary( srcs = ["accept_bind_stream.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -347,8 +332,8 @@ cc_binary( srcs = ["bind.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -361,7 +346,7 @@ cc_binary( srcs = ["socket.cc"], linkstatic = 1, deps = [ - ":socket_test_util", + "//test/util:socket_util", gtest, "//test/util:file_descriptor", "//test/util:temp_umask", @@ -376,9 +361,9 @@ cc_binary( srcs = ["socket_capability.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -477,6 +462,7 @@ cc_binary( "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:fs_util", + "@com_google_absl//absl/cleanup", "@com_google_absl//absl/strings", gtest, "//test/util:logging", @@ -537,9 +523,9 @@ cc_binary( srcs = ["connect_external.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", "//test/util:fs_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -581,6 +567,7 @@ cc_binary( deps = [ "//test/util:eventfd_util", "//test/util:file_descriptor", + "@com_google_absl//absl/memory", gtest, "//test/util:fs_util", "//test/util:posix_error", @@ -752,10 +739,10 @@ cc_binary( linkstatic = 1, deps = [ ":file_base", - ":socket_test_util", "//test/util:cleanup", "//test/util:eventfd_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", gtest, @@ -798,12 +785,12 @@ cc_binary( srcs = ["fcntl.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:capability_util", "//test/util:cleanup", "//test/util:eventfd_util", "//test/util:file_descriptor", "//test/util:fs_util", + "//test/util:socket_util", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/flags:flag", "@com_google_absl//absl/memory", @@ -831,8 +818,8 @@ cc_binary( ], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", gtest, @@ -1018,9 +1005,9 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", ":unix_domain_socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:signal_util", "//test/util:test_main", @@ -1062,9 +1049,9 @@ cc_binary( linkstatic = 1, deps = [ ":iptables_types", - ":socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -1080,9 +1067,9 @@ cc_binary( linkstatic = 1, deps = [ ":iptables_types", - ":socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -1455,16 +1442,17 @@ cc_binary( ) cc_binary( - name = "packet_socket_raw_test", + name = "packet_socket_dgram_test", testonly = 1, - srcs = ["packet_socket_raw.cc"], - defines = select_system(), + srcs = ["packet_socket_dgram.cc"], linkstatic = 1, deps = [ - ":socket_test_util", + ":ip_socket_test_util", ":unix_domain_socket_test_util", "//test/util:capability_util", + "//test/util:cleanup", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/base:endian", gtest, @@ -1474,15 +1462,17 @@ cc_binary( ) cc_binary( - name = "packet_socket_test", + name = "packet_socket_raw_test", testonly = 1, - srcs = ["packet_socket.cc"], + srcs = ["packet_socket_raw.cc"], linkstatic = 1, deps = [ - ":socket_test_util", + ":ip_socket_test_util", ":unix_domain_socket_test_util", "//test/util:capability_util", + "//test/util:cleanup", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/base:endian", gtest, @@ -1492,6 +1482,21 @@ cc_binary( ) cc_binary( + name = "packet_socket_test", + testonly = 1, + srcs = ["packet_socket.cc"], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:socket_util", + gtest, + "//test/util:test_main", + ], +) + +cc_binary( name = "pty_test", testonly = 1, srcs = ["pty.cc"], @@ -1538,9 +1543,9 @@ cc_binary( srcs = select_system(linux = ["partial_bad_buffer.cc"]), linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", "//test/util:fs_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:posix_error", @@ -1573,8 +1578,8 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -1771,6 +1776,7 @@ cc_binary( "//test/util:mount_util", "@com_google_absl//absl/container:node_hash_set", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", gtest, @@ -1792,10 +1798,10 @@ cc_binary( srcs = ["proc_net.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", "//test/util:fs_util", + "//test/util:socket_util", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", gtest, @@ -1883,6 +1889,7 @@ cc_binary( linkstatic = 1, deps = [ "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/strings", "@com_google_absl//absl/time", gtest, "//test/util:capability_util", @@ -1938,10 +1945,10 @@ cc_binary( srcs = ["raw_socket_hdrincl.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/base:endian", gtest, @@ -1957,10 +1964,10 @@ cc_binary( defines = select_system(), linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/base:core_headers", gtest, "//test/util:test_main", @@ -1974,10 +1981,10 @@ cc_binary( srcs = ["raw_socket_icmp.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/base:core_headers", gtest, "//test/util:test_main", @@ -2007,8 +2014,8 @@ cc_binary( srcs = ["readahead.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:temp_path", "//test/util:test_main", @@ -2207,8 +2214,8 @@ cc_binary( srcs = ["sendfile_socket.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/strings", gtest, ":ip_socket_test_util", @@ -2386,11 +2393,12 @@ cc_library( "socket_generic.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", gtest, + "//test/util:capability_util", "//test/util:test_util", ], alwayslink = 1, @@ -2406,8 +2414,8 @@ cc_binary( deps = [ gtest, ":ip_socket_test_util", - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", "//test/util:thread_util", @@ -2422,8 +2430,8 @@ cc_library( srcs = ["socket_unix_dgram.cc"], hdrs = ["socket_unix_dgram.h"], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:test_util", @@ -2437,8 +2445,8 @@ cc_library( srcs = ["socket_unix_seqpacket.cc"], hdrs = ["socket_unix_seqpacket.h"], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:test_util", @@ -2456,7 +2464,7 @@ cc_library( "socket_ip_tcp_generic.h", ], deps = [ - ":socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/memory", "@com_google_absl//absl/time", gtest, @@ -2477,8 +2485,8 @@ cc_library( "socket_non_blocking.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_util", ], @@ -2495,8 +2503,8 @@ cc_library( "socket_unix_non_stream.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:memory_util", "//test/util:test_util", @@ -2515,8 +2523,8 @@ cc_library( ], deps = [ ":ip_socket_test_util", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_util", ], @@ -2534,7 +2542,7 @@ cc_library( ], deps = [ ":ip_socket_test_util", - ":socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_util", ], @@ -2542,6 +2550,24 @@ cc_library( ) cc_library( + name = "socket_ipv4_datagram_based_socket_unbound_test_cases", + testonly = 1, + srcs = [ + "socket_ipv4_datagram_based_socket_unbound.cc", + ], + hdrs = [ + "socket_ipv4_datagram_based_socket_unbound.h", + ], + deps = [ + ":ip_socket_test_util", + "//test/util:capability_util", + "//test/util:socket_util", + gtest, + ], + alwayslink = 1, +) + +cc_library( name = "socket_ipv4_udp_unbound_test_cases", testonly = 1, srcs = [ @@ -2552,12 +2578,9 @@ cc_library( ], deps = [ ":ip_socket_test_util", - ":socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/memory", gtest, - "//test/util:posix_error", - "//test/util:save_util", - "//test/util:test_util", ], alwayslink = 1, ) @@ -2573,7 +2596,7 @@ cc_library( ], deps = [ ":ip_socket_test_util", - ":socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/memory", gtest, "//test/util:posix_error", @@ -2594,9 +2617,9 @@ cc_library( ], deps = [ ":socket_netlink_route_util", - ":socket_test_util", "//test/util:capability_util", "//test/util:cleanup", + "//test/util:socket_util", gtest, ], alwayslink = 1, @@ -2613,8 +2636,8 @@ cc_library( ], deps = [ ":socket_netlink_route_util", - ":socket_test_util", "//test/util:capability_util", + "//test/util:socket_util", gtest, ], alwayslink = 1, @@ -2631,7 +2654,7 @@ cc_library( ], deps = [ ":ip_socket_test_util", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_util", ], alwayslink = 1, @@ -2678,10 +2701,10 @@ cc_binary( linkstatic = 1, deps = [ ":socket_generic_test_cases", - ":socket_test_util", ":socket_unix_cmsg_test_cases", ":socket_unix_test_cases", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2696,8 +2719,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_non_blocking_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2710,10 +2733,10 @@ cc_binary( linkstatic = 1, deps = [ ":socket_non_stream_test_cases", - ":socket_test_util", ":socket_unix_dgram_test_cases", ":socket_unix_non_stream_test_cases", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2725,8 +2748,8 @@ cc_binary( srcs = ["socket_unix_dgram_non_blocking.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -2742,10 +2765,10 @@ cc_binary( linkstatic = 1, deps = [ ":socket_non_stream_test_cases", - ":socket_test_util", ":socket_unix_non_stream_test_cases", ":socket_unix_seqpacket_test_cases", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2757,8 +2780,8 @@ cc_binary( srcs = ["socket_unix_stream.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:test_main", @@ -2776,7 +2799,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_ip_tcp_generic_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2791,7 +2814,7 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -2808,7 +2831,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_generic_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2824,7 +2847,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_non_blocking_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2842,7 +2865,7 @@ cc_binary( ":socket_generic_test_cases", ":socket_ip_udp_test_cases", ":socket_non_stream_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2858,7 +2881,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_ipv4_udp_unbound_external_networking_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2874,7 +2897,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_ipv6_udp_unbound_external_networking_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2890,8 +2913,8 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_bind_to_device_util", - ":socket_test_util", "//test/util:capability_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -2909,8 +2932,8 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_bind_to_device_util", - ":socket_test_util", "//test/util:capability_util", + "//test/util:socket_util", "@com_google_absl//absl/container:node_hash_map", gtest, "//test/util:test_main", @@ -2929,8 +2952,8 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_bind_to_device_util", - ":socket_test_util", "//test/util:capability_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -2948,7 +2971,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_non_blocking_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2964,9 +2987,21 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_ipv4_udp_unbound_test_cases", - ":socket_test_util", "//test/util:test_main", - "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_ipv4_datagram_based_socket_unbound_loopback_test", + testonly = 1, + srcs = [ + "socket_ipv4_datagram_based_socket_unbound_loopback.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_ipv4_datagram_based_socket_unbound_test_cases", + "//test/util:test_main", ], ) @@ -2980,7 +3015,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_ipv6_udp_unbound_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -2995,7 +3030,7 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3013,7 +3048,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_ipv4_udp_unbound_netlink_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3029,7 +3064,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_ipv6_udp_unbound_netlink_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3044,7 +3079,7 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3061,8 +3096,8 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_netlink_route_util", - ":socket_test_util", "//test/util:capability_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3078,8 +3113,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_generic_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3094,8 +3129,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_non_blocking_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3110,10 +3145,10 @@ cc_binary( linkstatic = 1, deps = [ ":socket_generic_test_cases", - ":socket_test_util", ":socket_unix_cmsg_test_cases", ":socket_unix_test_cases", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3128,13 +3163,23 @@ cc_binary( linkstatic = 1, deps = [ ":socket_non_blocking_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], ) +cc_library( + name = "socket_inet_loopback_test_params", + testonly = 1, + hdrs = ["socket_inet_loopback_test_params.h"], + deps = [ + "//test/util:socket_util", + gtest, + ], +) + cc_binary( name = "socket_inet_loopback_test", testonly = 1, @@ -3142,8 +3187,9 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", + ":socket_inet_loopback_test_params", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", @@ -3163,16 +3209,31 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", + ":socket_inet_loopback_test_params", "//test/util:file_descriptor", - "@com_google_absl//absl/memory", + "//test/util:socket_util", "@com_google_absl//absl/strings", gtest, "//test/util:posix_error", "//test/util:save_util", "//test/util:test_main", "//test/util:test_util", - "//test/util:thread_util", + ], +) + +cc_binary( + name = "socket_inet_loopback_isolated_test", + testonly = 1, + srcs = ["socket_inet_loopback_isolated.cc"], + linkstatic = 1, + deps = [ + ":socket_inet_loopback_test_params", + ":socket_netlink_util", + "//test/util:socket_util", + gtest, + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/time", ], ) @@ -3182,8 +3243,8 @@ cc_binary( srcs = ["socket_netlink.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3198,10 +3259,10 @@ cc_binary( deps = [ ":socket_netlink_route_util", ":socket_netlink_util", - ":socket_test_util", "//test/util:capability_util", "//test/util:cleanup", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/strings:str_format", gtest, "//test/util:test_main", @@ -3216,8 +3277,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_netlink_util", - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3236,8 +3297,8 @@ cc_library( "socket_stream.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:test_util", @@ -3255,8 +3316,8 @@ cc_library( "socket_blocking.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:test_util", @@ -3276,10 +3337,13 @@ cc_library( "socket_unix.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:file_descriptor", + "//test/util:memory_util", + "//test/util:socket_util", "@com_google_absl//absl/strings", gtest, + "//test/util:temp_path", "//test/util:test_util", "//test/util:thread_util", ], @@ -3296,8 +3360,8 @@ cc_library( "socket_unix_cmsg.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/strings", gtest, "//test/util:test_util", @@ -3316,8 +3380,8 @@ cc_library( "socket_stream_blocking.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:test_util", @@ -3337,8 +3401,8 @@ cc_library( "socket_stream_nonblock.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_util", ], @@ -3355,8 +3419,8 @@ cc_library( "socket_non_stream_blocking.h", ], deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:test_util", @@ -3391,8 +3455,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_stream_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3407,8 +3471,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_stream_blocking_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3424,7 +3488,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_stream_blocking_test_cases", - ":socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3439,8 +3503,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_stream_nonblocking_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "//test/util:test_main", "//test/util:test_util", ], @@ -3452,8 +3516,8 @@ cc_binary( srcs = ["socket_unix_unbound_dgram.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3466,8 +3530,8 @@ cc_binary( srcs = ["socket_unix_unbound_abstract.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3480,8 +3544,8 @@ cc_binary( srcs = ["socket_unix_unbound_filesystem.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:file_descriptor", "//test/util:test_main", @@ -3498,8 +3562,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_blocking_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3516,7 +3580,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_blocking_test_cases", - ":socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3532,8 +3596,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_non_stream_blocking_test_cases", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3550,7 +3614,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_non_stream_blocking_test_cases", - ":socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3565,10 +3629,10 @@ cc_binary( ], linkstatic = 1, deps = [ - ":socket_test_util", ":socket_unix_cmsg_test_cases", ":socket_unix_test_cases", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3581,8 +3645,8 @@ cc_binary( srcs = ["socket_unix_unbound_seqpacket.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3595,8 +3659,8 @@ cc_binary( srcs = ["socket_unix_unbound_stream.cc"], linkstatic = 1, deps = [ - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -3610,8 +3674,8 @@ cc_binary( linkstatic = 1, deps = [ ":socket_netlink_util", - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/base:endian", gtest, "//test/util:test_main", @@ -3693,6 +3757,23 @@ cc_binary( ) cc_binary( + name = "verity_symlink_test", + testonly = 1, + srcs = ["verity_symlink.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + gtest, + "//test/util:fs_util", + "//test/util:mount_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:verity_util", + ], +) + +cc_binary( name = "sync_test", testonly = 1, # Android does not support syncfs in r22. @@ -3751,8 +3832,8 @@ cc_binary( defines = select_system(), linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", "@com_google_absl//absl/time", gtest, "//test/util:posix_error", @@ -3862,7 +3943,7 @@ cc_binary( srcs = ["tuntap.cc"], linkstatic = 1, deps = [ - ":socket_test_util", + "//test/util:socket_util", gtest, ":socket_netlink_route_util", "//test/util:capability_util", @@ -3895,8 +3976,8 @@ cc_binary( linkstatic = 1, deps = [ ":ip_socket_test_util", - ":socket_test_util", ":unix_domain_socket_test_util", + "//test/util:socket_util", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/time", @@ -3915,8 +3996,8 @@ cc_binary( srcs = ["udp_bind.cc"], linkstatic = 1, deps = [ - ":socket_test_util", "//test/util:file_descriptor", + "//test/util:socket_util", gtest, "//test/util:test_main", "//test/util:test_util", @@ -4096,12 +4177,10 @@ cc_binary( srcs = ["network_namespace.cc"], linkstatic = 1, deps = [ - ":socket_test_util", gtest, + ":ip_socket_test_util", "//test/util:capability_util", - "//test/util:posix_error", "//test/util:test_main", - "//test/util:test_util", "//test/util:thread_util", ], ) @@ -4141,6 +4220,22 @@ cc_binary( ) cc_binary( + name = "msgqueue_test", + testonly = 1, + srcs = ["msgqueue.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:signal_util", + "//test/util:temp_path", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + ], +) + +cc_binary( name = "fadvise64_test", testonly = 1, srcs = ["fadvise64.cc"], diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc index fe560cfc5..0d16d1d83 100644 --- a/test/syscalls/linux/accept_bind.cc +++ b/test/syscalls/linux/accept_bind.cc @@ -20,9 +20,9 @@ #include <vector> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -37,8 +37,7 @@ TEST_P(AllSocketPairTest, Listen) { sockets->first_addr_size()), SyscallSucceeds()); - ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5), - SyscallSucceeds()); + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); } TEST_P(AllSocketPairTest, ListenIncreaseBacklog) { @@ -48,10 +47,8 @@ TEST_P(AllSocketPairTest, ListenIncreaseBacklog) { sockets->first_addr_size()), SyscallSucceeds()); - ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5), - SyscallSucceeds()); - ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 10), - SyscallSucceeds()); + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + ASSERT_THAT(listen(sockets->first_fd(), 10), SyscallSucceeds()); } TEST_P(AllSocketPairTest, ListenDecreaseBacklog) { @@ -61,10 +58,8 @@ TEST_P(AllSocketPairTest, ListenDecreaseBacklog) { sockets->first_addr_size()), SyscallSucceeds()); - ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5), - SyscallSucceeds()); - ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 1), - SyscallSucceeds()); + ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds()); + ASSERT_THAT(listen(sockets->first_fd(), 1), SyscallSucceeds()); } TEST_P(AllSocketPairTest, ListenBacklogSizes) { diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc index 4857f160b..5f2b07105 100644 --- a/test/syscalls/linux/accept_bind_stream.cc +++ b/test/syscalls/linux/accept_bind_stream.cc @@ -19,9 +19,9 @@ #include <vector> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/bind.cc b/test/syscalls/linux/bind.cc index 9547c4ab2..8e1d00619 100644 --- a/test/syscalls/linux/bind.cc +++ b/test/syscalls/linux/bind.cc @@ -17,8 +17,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/cgroup.cc b/test/syscalls/linux/cgroup.cc index f29891571..ca23dfeee 100644 --- a/test/syscalls/linux/cgroup.cc +++ b/test/syscalls/linux/cgroup.cc @@ -279,6 +279,23 @@ TEST(Cgroup, UnmountRepeated) { EXPECT_THAT(umount(c.Path().c_str()), SyscallFailsWithErrno(EINVAL)); } +TEST(Cgroup, Create) { + SKIP_IF(!CgroupsAvailable()); + Mounter m(ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir())); + Cgroup c = ASSERT_NO_ERRNO_AND_VALUE(m.MountCgroupfs("")); + ASSERT_NO_ERRNO(c.CreateChild("child1")); + EXPECT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(Exists(c.Path()))); +} + +TEST(Cgroup, SubcontainerInitiallyEmpty) { + SKIP_IF(!CgroupsAvailable()); + Mounter m(ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir())); + Cgroup c = ASSERT_NO_ERRNO_AND_VALUE(m.MountCgroupfs("")); + Cgroup child = ASSERT_NO_ERRNO_AND_VALUE(c.CreateChild("child1")); + auto procs = ASSERT_NO_ERRNO_AND_VALUE(child.Procs()); + EXPECT_TRUE(procs.empty()); +} + TEST(MemoryCgroup, MemoryUsageInBytes) { SKIP_IF(!CgroupsAvailable()); diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc index fab79d300..7e4626f03 100644 --- a/test/syscalls/linux/chroot.cc +++ b/test/syscalls/linux/chroot.cc @@ -20,16 +20,17 @@ #include <syscall.h> #include <unistd.h> +#include <algorithm> #include <string> #include <vector> #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "absl/cleanup/cleanup.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "test/util/capability_util.h" -#include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" #include "test/util/logging.h" @@ -46,13 +47,52 @@ namespace testing { namespace { +// Async-signal-safe conversion from integer to string, appending the string +// (including a terminating NUL) to buf, which is a buffer of size len bytes. +// Returns the number of bytes written, or 0 if the buffer is too small. +// +// Preconditions: 2 <= radix <= 16. +template <typename T> +size_t SafeItoa(T val, char* buf, size_t len, int radix) { + size_t n = 0; +#define _WRITE_OR_FAIL(c) \ + do { \ + if (len == 0) { \ + return 0; \ + } \ + buf[n] = (c); \ + n++; \ + len--; \ + } while (false) + if (val == 0) { + _WRITE_OR_FAIL('0'); + } else { + // Write digits in reverse order, then reverse them at the end. + bool neg = val < 0; + while (val != 0) { + // C/C++ define modulo such that the result is negative if exactly one of + // the dividend or divisor is negative, so this handles both positive and + // negative values. + char c = "fedcba9876543210123456789abcdef"[val % radix + 15]; + _WRITE_OR_FAIL(c); + val /= 10; + } + if (neg) { + _WRITE_OR_FAIL('-'); + } + std::reverse(buf, buf + n); + } + _WRITE_OR_FAIL('\0'); + return n; +#undef _WRITE_OR_FAIL +} + TEST(ChrootTest, Success) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); + auto temp_dir = TempPath::CreateDir().ValueOrDie(); + const std::string temp_dir_path = temp_dir.path(); - const auto rest = [] { - auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - TEST_CHECK_SUCCESS(chroot(temp_dir.path().c_str())); - }; + const auto rest = [&] { TEST_CHECK_SUCCESS(chroot(temp_dir_path.c_str())); }; EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); } @@ -101,28 +141,34 @@ TEST(ChrootTest, CreatesNewRoot) { SyscallSucceeds()); auto new_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string new_root_path = new_root.path(); auto file_in_new_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(new_root.path())); + const std::string file_in_new_root_path = file_in_new_root.path(); const auto rest = [&] { // chroot into new_root. - TEST_CHECK_SUCCESS(chroot(new_root.path().c_str())); + TEST_CHECK_SUCCESS(chroot(new_root_path.c_str())); // getcwd should return "(unreachable)" followed by the initial_cwd. - char cwd[1024]; - TEST_CHECK_SUCCESS(syscall(__NR_getcwd, cwd, sizeof(cwd))); - std::string expected_cwd = "(unreachable)"; - expected_cwd += initial_cwd; - TEST_CHECK(strcmp(cwd, expected_cwd.c_str()) == 0); + char buf[1024]; + TEST_CHECK_SUCCESS(syscall(__NR_getcwd, buf, sizeof(buf))); + constexpr char kUnreachablePrefix[] = "(unreachable)"; + TEST_CHECK( + strncmp(buf, kUnreachablePrefix, sizeof(kUnreachablePrefix) - 1) == 0); + TEST_CHECK(strcmp(buf + sizeof(kUnreachablePrefix) - 1, initial_cwd) == 0); // Should not be able to stat file by its full path. struct stat statbuf; - TEST_CHECK_ERRNO(stat(file_in_new_root.path().c_str(), &statbuf), ENOENT); + TEST_CHECK_ERRNO(stat(file_in_new_root_path.c_str(), &statbuf), ENOENT); // Should be able to stat file at new rooted path. - auto basename = std::string(Basename(file_in_new_root.path())); - auto rootedFile = "/" + basename; - TEST_CHECK_SUCCESS(stat(rootedFile.c_str(), &statbuf)); + buf[0] = '/'; + absl::string_view basename = Basename(file_in_new_root_path); + TEST_CHECK(basename.length() < (sizeof(buf) - 2)); + memcpy(buf + 1, basename.data(), basename.length()); + buf[basename.length() + 1] = '\0'; + TEST_CHECK_SUCCESS(stat(buf, &statbuf)); // Should be able to stat cwd at '.' even though it's outside root. TEST_CHECK_SUCCESS(stat(".", &statbuf)); @@ -131,8 +177,8 @@ TEST(ChrootTest, CreatesNewRoot) { TEST_CHECK_SUCCESS(chdir("/")); // getcwd should return "/". - TEST_CHECK_SUCCESS(syscall(__NR_getcwd, cwd, sizeof(cwd))); - TEST_CHECK_SUCCESS(strcmp(cwd, "/") == 0); + TEST_CHECK_SUCCESS(syscall(__NR_getcwd, buf, sizeof(buf))); + TEST_CHECK_SUCCESS(strcmp(buf, "/") == 0); // Statting '.', '..', '/', and '/..' all return the same dev and inode. struct stat statbuf_dot; @@ -160,10 +206,11 @@ TEST(ChrootTest, DotDotFromOpenFD) { auto fd = ASSERT_NO_ERRNO_AND_VALUE( Open(dir_outside_root.path(), O_RDONLY | O_DIRECTORY)); auto new_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string new_root_path = new_root.path(); const auto rest = [&] { // chroot into new_root. - TEST_CHECK_SUCCESS(chroot(new_root.path().c_str())); + TEST_CHECK_SUCCESS(chroot(new_root_path.c_str())); // openat on fd with path .. will succeed. int other_fd; @@ -184,15 +231,18 @@ TEST(ChrootTest, ProcFdLinkResolutionInChroot) { const TempPath file_outside_chroot = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string file_outside_chroot_path = file_outside_chroot.path(); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file_outside_chroot.path(), O_RDONLY)); const FileDescriptor proc_fd = ASSERT_NO_ERRNO_AND_VALUE( Open("/proc", O_DIRECTORY | O_RDONLY | O_CLOEXEC)); + auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string temp_dir_path = temp_dir.path(); + const auto rest = [&] { - auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - TEST_CHECK_SUCCESS(chroot(temp_dir.path().c_str())); + TEST_CHECK_SUCCESS(chroot(temp_dir_path.c_str())); // Opening relative to an already open fd to a node outside the chroot // works. @@ -201,9 +251,10 @@ TEST(ChrootTest, ProcFdLinkResolutionInChroot) { // Proc fd symlinks can escape the chroot if the fd the symlink refers to // refers to an object outside the chroot. + char fd_buf[11]; + TEST_CHECK(SafeItoa(fd.get(), fd_buf, sizeof(fd_buf), 10)); struct stat s = {}; - TEST_CHECK_SUCCESS( - fstatat(proc_self_fd.get(), absl::StrCat(fd.get()).c_str(), &s, 0)); + TEST_CHECK_SUCCESS(fstatat(proc_self_fd.get(), fd_buf, &s, 0)); // Try to stat the stdin fd. Internally, this is handled differently from a // proc fd entry pointing to a file, since stdin is backed by a host fd, and @@ -223,10 +274,12 @@ TEST(ChrootTest, ProcMemSelfFdsNoEscapeProcOpen) { const FileDescriptor proc = ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY)); + const auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string temp_dir_path = temp_dir.path(); + const auto rest = [&] { - // Create and enter a chroot directory. - const auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - TEST_CHECK_SUCCESS(chroot(temp_dir.path().c_str())); + // Enter the chroot directory. + TEST_CHECK_SUCCESS(chroot(temp_dir_path.c_str())); // Open a file inside the chroot at /foo. const FileDescriptor foo = @@ -234,11 +287,15 @@ TEST(ChrootTest, ProcMemSelfFdsNoEscapeProcOpen) { // Examine /proc/self/fd/{foo_fd} to see if it exposes the fact that we're // inside a chroot, the path should be /foo and NOT {chroot_dir}/foo. - const std::string fd_path = absl::StrCat("self/fd/", foo.get()); + constexpr char kSelfFdRelpath[] = "self/fd/"; + char path_buf[20]; + strcpy(path_buf, kSelfFdRelpath); // NOLINT: need async-signal-safety + TEST_CHECK(SafeItoa(foo.get(), path_buf + sizeof(kSelfFdRelpath) - 1, + sizeof(path_buf) - (sizeof(kSelfFdRelpath) - 1), 10)); char buf[1024] = {}; size_t bytes_read = 0; - TEST_CHECK_SUCCESS(bytes_read = readlinkat(proc.get(), fd_path.c_str(), buf, - sizeof(buf) - 1)); + TEST_CHECK_SUCCESS( + bytes_read = readlinkat(proc.get(), path_buf, buf, sizeof(buf) - 1)); // The link should resolve to something. TEST_CHECK(bytes_read > 0); @@ -258,10 +315,12 @@ TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) { const FileDescriptor proc = ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY)); + const auto temp_dir = TEST_CHECK_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string temp_dir_path = temp_dir.path(); + const auto rest = [&] { - // Create and enter a chroot directory. - const auto temp_dir = TEST_CHECK_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - TEST_CHECK_SUCCESS(chroot(temp_dir.path().c_str())); + // Enter the chroot directory. + TEST_CHECK_SUCCESS(chroot(temp_dir_path.c_str())); // Open a file inside the chroot at /foo. const FileDescriptor foo = @@ -272,9 +331,12 @@ TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) { MAP_PRIVATE, foo.get(), 0); TEST_CHECK_SUCCESS(reinterpret_cast<int64_t>(foo_map)); - // Always unmap. - auto cleanup_map = - Cleanup([&] { TEST_CHECK_SUCCESS(munmap(foo_map, kPageSize)); }); + // Always unmap. Since this function is called between fork() and execve(), + // we can't use gvisor::testing::Cleanup, which uses std::function + // and thus may heap-allocate (which is async-signal-unsafe); instead, use + // absl::Cleanup, which is templated on the callback type. + auto cleanup_map = absl::MakeCleanup( + [&] { TEST_CHECK_SUCCESS(munmap(foo_map, kPageSize)); }); // Examine /proc/self/maps to be sure that /foo doesn't appear to be // mapped with the full chroot path. @@ -289,8 +351,8 @@ TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) { TEST_CHECK(bytes_read > 0); // Finally we want to make sure the maps don't contain the chroot path - TEST_CHECK(std::string(buf, bytes_read).find(temp_dir.path()) == - std::string::npos); + TEST_CHECK( + !absl::StrContains(absl::string_view(buf, bytes_read), temp_dir_path)); }; EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); } @@ -302,72 +364,72 @@ TEST(ChrootTest, ProcMountsMountinfoNoEscape) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT))); // Create nested tmpfs mounts. - auto const outer_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - auto const outer_mount = ASSERT_NO_ERRNO_AND_VALUE( - Mount("none", outer_dir.path(), "tmpfs", 0, "mode=0700", 0)); - - auto const inner_dir = - ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(outer_dir.path())); - auto const inner_mount = ASSERT_NO_ERRNO_AND_VALUE( - Mount("none", inner_dir.path(), "tmpfs", 0, "mode=0700", 0)); + const auto outer_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string outer_dir_path = outer_dir.path(); + const auto outer_mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("none", outer_dir_path, "tmpfs", 0, "mode=0700", 0)); + + const auto inner_dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(outer_dir_path)); + const std::string inner_dir_path = inner_dir.path(); + const auto inner_mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("none", inner_dir_path, "tmpfs", 0, "mode=0700", 0)); + const std::string inner_dir_in_outer_chroot_path = + absl::StrCat("/", Basename(inner_dir_path)); + + // Filenames that will be checked for mounts, all relative to /proc dir. + std::string paths[3] = {"mounts", "self/mounts", "self/mountinfo"}; + + for (const std::string& path : paths) { + // We should have both inner and outer mounts. + const std::string contents = + ASSERT_NO_ERRNO_AND_VALUE(GetContents(JoinPath("/proc", path))); + EXPECT_THAT(contents, + AllOf(HasSubstr(outer_dir_path), HasSubstr(inner_dir_path))); + // We better have at least two mounts: the mounts we created plus the + // root. + std::vector<absl::string_view> submounts = + absl::StrSplit(contents, '\n', absl::SkipWhitespace()); + ASSERT_GT(submounts.size(), 2); + } - const auto rest = [&outer_dir, &inner_dir] { - // Filenames that will be checked for mounts, all relative to /proc dir. - std::string paths[3] = {"mounts", "self/mounts", "self/mountinfo"}; - - for (const std::string& path : paths) { - // We should have both inner and outer mounts. - const std::string contents = - TEST_CHECK_NO_ERRNO_AND_VALUE(GetContents(JoinPath("/proc", path))); - EXPECT_THAT(contents, AllOf(HasSubstr(outer_dir.path()), - HasSubstr(inner_dir.path()))); - // We better have at least two mounts: the mounts we created plus the - // root. - std::vector<absl::string_view> submounts = - absl::StrSplit(contents, '\n', absl::SkipWhitespace()); - TEST_CHECK(submounts.size() > 2); - } - - // Get a FD to /proc before we enter the chroot. - const FileDescriptor proc = - TEST_CHECK_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY)); + // Get a FD to /proc before we enter the chroot. + const FileDescriptor proc = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY)); + const auto rest = [&] { // Chroot to outer mount. - TEST_CHECK_SUCCESS(chroot(outer_dir.path().c_str())); + TEST_CHECK_SUCCESS(chroot(outer_dir_path.c_str())); + char buf[8 * 1024]; for (const std::string& path : paths) { const FileDescriptor proc_file = TEST_CHECK_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), path, O_RDONLY)); // Only two mounts visible from this chroot: the inner and outer. Both // paths should be relative to the new chroot. - const std::string contents = - TEST_CHECK_NO_ERRNO_AND_VALUE(GetContentsFD(proc_file.get())); - EXPECT_THAT(contents, - AllOf(HasSubstr(absl::StrCat(Basename(inner_dir.path()))), - Not(HasSubstr(outer_dir.path())), - Not(HasSubstr(inner_dir.path())))); - std::vector<absl::string_view> submounts = - absl::StrSplit(contents, '\n', absl::SkipWhitespace()); - TEST_CHECK(submounts.size() == 2); + ssize_t n = ReadFd(proc_file.get(), buf, sizeof(buf)); + TEST_PCHECK(n >= 0); + buf[n] = '\0'; + TEST_CHECK(absl::StrContains(buf, Basename(inner_dir_path))); + TEST_CHECK(!absl::StrContains(buf, outer_dir_path)); + TEST_CHECK(!absl::StrContains(buf, inner_dir_path)); + TEST_CHECK(std::count(buf, buf + n, '\n') == 2); } // Chroot to inner mount. We must use an absolute path accessible to our // chroot. - const std::string inner_dir_basename = - absl::StrCat("/", Basename(inner_dir.path())); - TEST_CHECK_SUCCESS(chroot(inner_dir_basename.c_str())); + TEST_CHECK_SUCCESS(chroot(inner_dir_in_outer_chroot_path.c_str())); for (const std::string& path : paths) { const FileDescriptor proc_file = TEST_CHECK_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), path, O_RDONLY)); - const std::string contents = - TEST_CHECK_NO_ERRNO_AND_VALUE(GetContentsFD(proc_file.get())); // Only the inner mount visible from this chroot. - std::vector<absl::string_view> submounts = - absl::StrSplit(contents, '\n', absl::SkipWhitespace()); - TEST_CHECK(submounts.size() == 1); + ssize_t n = ReadFd(proc_file.get(), buf, sizeof(buf)); + TEST_PCHECK(n >= 0); + buf[n] = '\0'; + TEST_CHECK(std::count(buf, buf + n, '\n') == 1); } }; EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); diff --git a/test/syscalls/linux/connect_external.cc b/test/syscalls/linux/connect_external.cc index 1edb50e47..fb2476da4 100644 --- a/test/syscalls/linux/connect_external.cc +++ b/test/syscalls/linux/connect_external.cc @@ -22,9 +22,9 @@ #include <tuple> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // This file contains tests specific to connecting to host UDS managed outside diff --git a/test/syscalls/linux/dup.cc b/test/syscalls/linux/dup.cc index ba4e13fb9..8f0974f45 100644 --- a/test/syscalls/linux/dup.cc +++ b/test/syscalls/linux/dup.cc @@ -13,9 +13,11 @@ // limitations under the License. #include <fcntl.h> +#include <sys/resource.h> #include <unistd.h> #include "gtest/gtest.h" +#include "absl/memory/memory.h" #include "test/util/eventfd_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" @@ -98,6 +100,45 @@ TEST(DupTest, Dup2) { ASSERT_NO_ERRNO(CheckSameFile(fd, nfd2)); } +TEST(DupTest, Rlimit) { + auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); + + struct rlimit rl = {}; + EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + + ASSERT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + + constexpr int kFDLimit = 101; + // Create a file descriptor that will be above the limit. + FileDescriptor aboveLimitFD = + ASSERT_NO_ERRNO_AND_VALUE(Dup2(fd, kFDLimit * 2 - 1)); + + rl.rlim_cur = kFDLimit; + ASSERT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + ASSERT_THAT(dup3(fd.get(), kFDLimit, 0), SyscallFails()); + + std::vector<std::unique_ptr<FileDescriptor>> fds; + int prev_fd = fd.get(); + int used_fds = 0; + for (int i = 0; i < kFDLimit; ++i) { + int new_fd = dup(fd.get()); + if (new_fd == -1) { + break; + } + auto f = absl::make_unique<FileDescriptor>(new_fd); + EXPECT_LT(new_fd, kFDLimit); + EXPECT_GT(new_fd, prev_fd); + // Check that all fds in (prev_fd, new_fd) are used. + for (int j = prev_fd + 1; j < new_fd; ++j) { + if (fcntl(j, F_GETFD) != -1) used_fds++; + } + prev_fd = new_fd; + fds.push_back(std::move(f)); + } + EXPECT_EQ(fds.size() + used_fds, kFDLimit - fd.get() - 1); +} + TEST(DupTest, Dup2SameFD) { auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY)); diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc index 8202d35fa..a2cc59e83 100644 --- a/test/syscalls/linux/eventfd.cc +++ b/test/syscalls/linux/eventfd.cc @@ -149,31 +149,6 @@ TEST(EventfdTest, BigWriteBigRead) { EXPECT_EQ(l[0], 1); } -TEST(EventfdTest, SpliceFromPipePartialSucceeds) { - int pipes[2]; - ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds()); - const FileDescriptor pipe_rfd(pipes[0]); - const FileDescriptor pipe_wfd(pipes[1]); - constexpr uint64_t kVal{1}; - - FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK)); - - uint64_t event_array[2]; - event_array[0] = kVal; - event_array[1] = kVal; - ASSERT_THAT(write(pipe_wfd.get(), event_array, sizeof(event_array)), - SyscallSucceedsWithValue(sizeof(event_array))); - EXPECT_THAT(splice(pipe_rfd.get(), /*__offin=*/nullptr, efd.get(), - /*__offout=*/nullptr, sizeof(event_array[0]) + 1, - SPLICE_F_NONBLOCK), - SyscallSucceedsWithValue(sizeof(event_array[0]))); - - uint64_t val; - ASSERT_THAT(read(efd.get(), &val, sizeof(val)), - SyscallSucceedsWithValue(sizeof(val))); - EXPECT_EQ(val, kVal); -} - // NotifyNonZero is inherently racy, so random save is disabled. TEST(EventfdTest, NotifyNonZero) { // Waits will time out at 10 seconds. diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 4df429ed8..a0016146a 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -306,9 +306,6 @@ TEST(ExecTest, InterpreterScriptNoPath) { TempPath script = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "#!\n\n", 0755)); - std::cerr << "path: " << script.path() << std::endl; - std::cerr << system(absl::StrCat("cat ", script.path()).c_str()) << std::endl; - int execve_errno; ASSERT_NO_ERRNO_AND_VALUE( ForkAndExec(script.path(), {script.path()}, {}, nullptr, &execve_errno)); diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index 5c839447e..5f1b4d5e5 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -31,11 +31,11 @@ #include "absl/strings/str_cat.h" #include "absl/time/time.h" #include "test/syscalls/linux/file_base.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/cleanup.h" #include "test/util/eventfd_util.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" +#include "test/util/socket_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 91526572b..0e78a4d4a 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -35,7 +35,6 @@ #include "absl/strings/str_cat.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/cleanup.h" #include "test/util/eventfd_util.h" @@ -46,6 +45,7 @@ #include "test/util/posix_error.h" #include "test/util/save_util.h" #include "test/util/signal_util.h" +#include "test/util/socket_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc index 10dad042f..686b779be 100644 --- a/test/syscalls/linux/flock.cc +++ b/test/syscalls/linux/flock.cc @@ -21,8 +21,8 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/file_base.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index f6b78989b..e2622232d 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1849,34 +1849,6 @@ TEST(Inotify, SpliceOnWatchTarget) { })); } -TEST(Inotify, SpliceOnInotifyFD) { - int pipefds[2]; - ASSERT_THAT(pipe2(pipefds, O_NONBLOCK), SyscallSucceeds()); - - const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - const FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); - const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( - root.path(), "some content", TempPath::kDefaultFileMode)); - - const FileDescriptor file1_fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY)); - const int watcher = ASSERT_NO_ERRNO_AND_VALUE( - InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); - - char buf; - EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds()); - - EXPECT_THAT(splice(fd.get(), nullptr, pipefds[1], nullptr, - sizeof(struct inotify_event) + 1, SPLICE_F_NONBLOCK), - SyscallSucceedsWithValue(sizeof(struct inotify_event))); - - const FileDescriptor read_fd(pipefds[0]); - const std::vector<Event> events = - ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(read_fd.get())); - ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)})); -} - // Watches on a parent should not be triggered by actions on a hard link to one // of its children that has a different parent. TEST(Inotify, LinkOnOtherParent) { diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc index 9b16d1558..88056ef2e 100644 --- a/test/syscalls/linux/ioctl.cc +++ b/test/syscalls/linux/ioctl.cc @@ -26,10 +26,10 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/signal_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/ip6tables.cc b/test/syscalls/linux/ip6tables.cc index e0e146067..d11b45d4a 100644 --- a/test/syscalls/linux/ip6tables.cc +++ b/test/syscalls/linux/ip6tables.cc @@ -17,9 +17,9 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/iptables.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index e90a7e411..a1216d23f 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -156,6 +156,14 @@ SocketKind ICMPv6UnboundSocket(int type) { UnboundSocketCreator(AF_INET6, type | SOCK_DGRAM, IPPROTO_ICMPV6)}; } +SocketKind IPv4RawUDPUnboundSocket(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv4 Raw UDP socket"); + return SocketKind{ + description, AF_INET, type | SOCK_RAW, IPPROTO_UDP, + UnboundSocketCreator(AF_INET, type | SOCK_RAW, IPPROTO_UDP)}; +} + SocketKind IPv4UDPUnboundSocket(int type) { std::string description = absl::StrCat(DescribeSocketType(type), "IPv4 UDP socket"); diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index bde481f7e..556838356 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -21,7 +21,7 @@ #include <string> -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { @@ -35,6 +35,9 @@ uint16_t PortFromInetSockaddr(const struct sockaddr* addr); // InterfaceIndex returns the index of the named interface. PosixErrorOr<int> InterfaceIndex(std::string name); +// GetLoopbackIndex returns the index of the loopback interface. +inline PosixErrorOr<int> GetLoopbackIndex() { return InterfaceIndex("lo"); } + // IPv6TCPAcceptBindSocketPair returns a SocketPairKind that represents // SocketPairs created with bind() and accept() syscalls with AF_INET6 and the // given type bound to the IPv6 loopback. @@ -92,6 +95,10 @@ SocketKind ICMPUnboundSocket(int type); // created with AF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6, and the given type. SocketKind ICMPv6UnboundSocket(int type); +// IPv4RawUDPUnboundSocket returns a SocketKind that represents a SimpleSocket +// created with AF_INET, SOCK_RAW, IPPROTO_UDP, and the given type. +SocketKind IPv4RawUDPUnboundSocket(int type); + // IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket // created with AF_INET, SOCK_DGRAM, IPPROTO_UDP, and the given type. SocketKind IPv4UDPUnboundSocket(int type); diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc index ac113e6da..9fb04eae6 100644 --- a/test/syscalls/linux/itimer.cc +++ b/test/syscalls/linux/itimer.cc @@ -197,9 +197,9 @@ int TestSIGALRMToMainThread() { // (but don't guarantee it), so we expect to see most samples on the main // thread. // - // The number of SIGALRMs delivered to a worker should not exceed 20% + // The number of SIGALRMs delivered to a worker should not exceed 40% // of the number of total signals expected (this is somewhat arbitrary). - const int worker_threshold = result.expected_total / 5; + const int worker_threshold = result.expected_total / 5 * 2; // // Linux only guarantees timers will never expire before the requested time. @@ -230,7 +230,8 @@ TEST(ItimerTest, DeliversSIGALRMToMainThread) { // Not required anymore. kill.Release(); - EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status; + EXPECT_EQ(WIFEXITED(status) && WEXITSTATUS(status), 0) + << WIFEXITED(status) << " " << WEXITSTATUS(status); } // Signals are delivered to threads fairly. diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc index 4f9ca1a65..8b208f99a 100644 --- a/test/syscalls/linux/link.cc +++ b/test/syscalls/linux/link.cc @@ -142,7 +142,8 @@ TEST(LinkTest, OldnameIsEmpty) { TEST(LinkTest, OldnameDoesNotExist) { const std::string oldname = NewTempAbsPath(); const std::string newname = NewTempAbsPath(); - EXPECT_THAT(link("", newname.c_str()), SyscallFailsWithErrno(ENOENT)); + EXPECT_THAT(link(oldname.c_str(), newname.c_str()), + SyscallFailsWithErrno(ENOENT)); } TEST(LinkTest, NewnameCannotExist) { diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc index d4f89527c..dbc21833f 100644 --- a/test/syscalls/linux/lseek.cc +++ b/test/syscalls/linux/lseek.cc @@ -121,7 +121,8 @@ TEST(LseekTest, InvalidFD) { } TEST(LseekTest, DirCurEnd) { - const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open("/tmp", O_RDONLY)); + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(GetAbsoluteTestTmpdir().c_str(), O_RDONLY)); ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); } diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc index 4a450742b..dbd1c93ae 100644 --- a/test/syscalls/linux/memfd.cc +++ b/test/syscalls/linux/memfd.cc @@ -445,9 +445,10 @@ TEST(MemfdTest, SealsAreInodeLevelProperties) { // Tmpfs files also support seals, but are created with F_SEAL_SEAL. TEST(MemfdTest, TmpfsFilesHaveSealSeal) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs("/tmp"))); + std::string tmpdir = GetAbsoluteTestTmpdir(); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(tmpdir.c_str()))); const TempPath tmpfs_file = - ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn("/tmp")); + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(tmpdir.c_str())); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfs_file.path(), O_RDWR, 0644)); EXPECT_THAT(fcntl(fd.get(), F_GET_SEALS), diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index 93a6d9cde..bb2a0cb57 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -29,6 +29,7 @@ #include <sys/wait.h> #include <unistd.h> +#include <limits> #include <vector> #include "gmock/gmock.h" @@ -793,6 +794,19 @@ class MMapFileTest : public MMapTest { ASSERT_THAT(unlink(filename_.c_str()), SyscallSucceeds()); } + bool FSSupportsMap() const { + bool supported = true; + void* ret = mmap(nullptr, 1, PROT_NONE, MAP_PRIVATE, fd_.get(), 0); + if (ret == MAP_FAILED && errno != ENODEV) { + supported = false; + } + if (ret != MAP_FAILED) { + munmap(ret, 1); + } + + return supported; + } + ssize_t Read(char* buf, size_t count) { ssize_t len = 0; do { @@ -840,12 +854,14 @@ class MMapFileParamTest // MAP_POPULATE allowed. // There isn't a good way to verify it actually did anything. TEST_P(MMapFileParamTest, MapPopulate) { + SKIP_IF(!FSSupportsMap()); ASSERT_THAT(Map(0, kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0), SyscallSucceeds()); } // MAP_POPULATE on a short file. TEST_P(MMapFileParamTest, MapPopulateShort) { + SKIP_IF(!FSSupportsMap()); ASSERT_THAT( Map(0, 2 * kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0), SyscallSucceeds()); @@ -853,6 +869,7 @@ TEST_P(MMapFileParamTest, MapPopulateShort) { // Read contents from mapped file. TEST_F(MMapFileTest, Read) { + SKIP_IF(!FSSupportsMap()); size_t len = strlen(kFileContents); ASSERT_EQ(len, Write(kFileContents, len)); @@ -866,6 +883,7 @@ TEST_F(MMapFileTest, Read) { // Map at an offset. TEST_F(MMapFileTest, MapOffset) { + SKIP_IF(!FSSupportsMap()); ASSERT_THAT(lseek(fd_.get(), kPageSize, SEEK_SET), SyscallSucceeds()); size_t len = strlen(kFileContents); @@ -881,6 +899,7 @@ TEST_F(MMapFileTest, MapOffset) { } TEST_F(MMapFileTest, MapOffsetBeyondEnd) { + SKIP_IF(!FSSupportsMap()); SetupGvisorDeathTest(); uintptr_t addr; @@ -895,16 +914,46 @@ TEST_F(MMapFileTest, MapOffsetBeyondEnd) { ::testing::KilledBySignal(SIGBUS), ""); } -// Verify mmap fails when sum of length and offset overflows. -TEST_F(MMapFileTest, MapLengthPlusOffsetOverflows) { - const size_t length = static_cast<size_t>(-kPageSize); - const off_t offset = kPageSize; - ASSERT_THAT(Map(0, length, PROT_READ, MAP_PRIVATE, fd_.get(), offset), - SyscallFailsWithErrno(ENOMEM)); +TEST_F(MMapFileTest, MapSecondToLastPositivePage) { + SKIP_IF(!FSSupportsMap()); + EXPECT_THAT( + Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), + (std::numeric_limits<off_t>::max() - kPageSize) & ~(kPageSize - 1)), + SyscallSucceeds()); +} + +TEST_F(MMapFileTest, MapLastPositivePage) { + SKIP_IF(!FSSupportsMap()); + // For regular files, this should fail due to integer overflow of the end + // offset. + EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), + std::numeric_limits<off_t>::max() & ~(kPageSize - 1)), + SyscallFailsWithErrno(EOVERFLOW)); +} + +TEST_F(MMapFileTest, MapFirstNegativePage) { + SKIP_IF(!FSSupportsMap()); + EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), + std::numeric_limits<off_t>::min()), + SyscallFailsWithErrno(EOVERFLOW)); +} + +TEST_F(MMapFileTest, MapSecondToLastNegativePage) { + SKIP_IF(!FSSupportsMap()); + EXPECT_THAT( + Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), -(2 * kPageSize)), + SyscallFailsWithErrno(EOVERFLOW)); +} + +TEST_F(MMapFileTest, MapLastNegativePage) { + SKIP_IF(!FSSupportsMap()); + EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), -kPageSize), + SyscallFailsWithErrno(EOVERFLOW)); } // MAP_PRIVATE PROT_WRITE is allowed on read-only FDs. TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) { + SKIP_IF(!FSSupportsMap()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_RDONLY)); @@ -921,6 +970,7 @@ TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) { // MAP_SHARED PROT_WRITE not allowed on read-only FDs. TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) { + SKIP_IF(!FSSupportsMap()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_RDONLY)); @@ -932,6 +982,7 @@ TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) { // Mmap not allowed on O_PATH FDs. TEST_F(MMapFileTest, MmapFileWithOpath) { + SKIP_IF(!FSSupportsMap()); SKIP_IF(IsRunningWithVFS1()); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const FileDescriptor fd = @@ -944,6 +995,7 @@ TEST_F(MMapFileTest, MmapFileWithOpath) { // The FD must be readable. TEST_P(MMapFileParamTest, WriteOnlyFd) { + SKIP_IF(!FSSupportsMap()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); @@ -955,6 +1007,7 @@ TEST_P(MMapFileParamTest, WriteOnlyFd) { // Overwriting the contents of a file mapped MAP_SHARED PROT_READ // should cause the new data to be reflected in the mapping. TEST_F(MMapFileTest, ReadSharedConsistentWithOverwrite) { + SKIP_IF(!FSSupportsMap()); // Start from scratch. EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); @@ -994,6 +1047,7 @@ TEST_F(MMapFileTest, ReadSharedConsistentWithOverwrite) { // Partially overwriting a file mapped MAP_SHARED PROT_READ should be reflected // in the mapping. TEST_F(MMapFileTest, ReadSharedConsistentWithPartialOverwrite) { + SKIP_IF(!FSSupportsMap()); // Start from scratch. EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); @@ -1034,6 +1088,7 @@ TEST_F(MMapFileTest, ReadSharedConsistentWithPartialOverwrite) { // Overwriting a file mapped MAP_SHARED PROT_READ should be reflected in the // mapping and the file. TEST_F(MMapFileTest, ReadSharedConsistentWithWriteAndFile) { + SKIP_IF(!FSSupportsMap()); // Start from scratch. EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); @@ -1077,6 +1132,7 @@ TEST_F(MMapFileTest, ReadSharedConsistentWithWriteAndFile) { // Write data to mapped file. TEST_F(MMapFileTest, WriteShared) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd_.get(), 0), @@ -1101,6 +1157,7 @@ TEST_F(MMapFileTest, WriteShared) { // Write data to portion of mapped page beyond the end of the file. // These writes are not reflected in the file. TEST_F(MMapFileTest, WriteSharedBeyondEnd) { + SKIP_IF(!FSSupportsMap()); // The file is only half of a page. We map an entire page. Writes to the // end of the mapping must not be reflected in the file. uintptr_t addr; @@ -1137,6 +1194,7 @@ TEST_F(MMapFileTest, WriteSharedBeyondEnd) { // The portion of a mapped page that becomes part of the file after a truncate // is reflected in the file. TEST_F(MMapFileTest, WriteSharedTruncateUp) { + SKIP_IF(!FSSupportsMap()); // The file is only half of a page. We map an entire page. Writes to the // end of the mapping must not be reflected in the file. uintptr_t addr; @@ -1174,6 +1232,7 @@ TEST_F(MMapFileTest, WriteSharedTruncateUp) { } TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) { + SKIP_IF(!FSSupportsMap()); // Start from scratch. EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); @@ -1213,6 +1272,7 @@ TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) { } TEST_F(MMapFileTest, WriteSharedTruncateDownThenUp) { + SKIP_IF(!FSSupportsMap()); // The file is only half of a page. We map an entire page. Writes to the // end of the mapping must not be reflected in the file. uintptr_t addr; @@ -1247,6 +1307,7 @@ TEST_F(MMapFileTest, WriteSharedTruncateDownThenUp) { } TEST_F(MMapFileTest, ReadSharedTruncateSIGBUS) { + SKIP_IF(!FSSupportsMap()); SetupGvisorDeathTest(); // Start from scratch. @@ -1277,6 +1338,7 @@ TEST_F(MMapFileTest, ReadSharedTruncateSIGBUS) { } TEST_F(MMapFileTest, WriteSharedTruncateSIGBUS) { + SKIP_IF(!FSSupportsMap()); SetupGvisorDeathTest(); uintptr_t addr; @@ -1298,6 +1360,7 @@ TEST_F(MMapFileTest, WriteSharedTruncateSIGBUS) { } TEST_F(MMapFileTest, ReadSharedTruncatePartialPage) { + SKIP_IF(!FSSupportsMap()); // Start from scratch. EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds()); @@ -1327,6 +1390,7 @@ TEST_F(MMapFileTest, ReadSharedTruncatePartialPage) { // Page can still be accessed and contents are intact after truncating a partial // page. TEST_F(MMapFileTest, WriteSharedTruncatePartialPage) { + SKIP_IF(!FSSupportsMap()); // Expand the file to a full page. EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds()); @@ -1354,6 +1418,7 @@ TEST_F(MMapFileTest, WriteSharedTruncatePartialPage) { // MAP_PRIVATE writes are not carried through to the underlying file. TEST_F(MMapFileTest, WritePrivate) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_.get(), 0), @@ -1378,6 +1443,7 @@ TEST_F(MMapFileTest, WritePrivate) { // SIGBUS raised when reading or writing past end of a mapped file. TEST_P(MMapFileParamTest, SigBusDeath) { + SKIP_IF(!FSSupportsMap()); SetupGvisorDeathTest(); uintptr_t addr; @@ -1406,6 +1472,7 @@ TEST_P(MMapFileParamTest, SigBusDeath) { // // See b/27877699. TEST_P(MMapFileParamTest, NoSigBusOnPagesBeforeEOF) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0), SyscallSucceeds()); @@ -1424,6 +1491,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPagesBeforeEOF) { // Tests that SIGBUS is not raised when reading or writing from a file-mapped // page containing EOF, *after* the EOF. TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0), SyscallSucceeds()); @@ -1446,6 +1514,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) { // page cache (which does not yet support writing to shared mappings), a bug // caused reads to fail unnecessarily on such mappings. See b/28913513. TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; size_t len = strlen(kFileContents); @@ -1463,6 +1532,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) { // read past end of file (resulting in a fault in sentry context in the gVisor // case). See b/28913513. TEST_F(MMapFileTest, InternalSigBus) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_.get(), 0), @@ -1483,6 +1553,7 @@ TEST_F(MMapFileTest, InternalSigBus) { // /dev/zero to a shared mapping (so that the SIGBUS isn't caught during // copy-on-write breaking). TEST_F(MMapFileTest, InternalSigBusZeroing) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd_.get(), 0), @@ -1578,6 +1649,7 @@ TEST_F(MMapTest, NoReserve) { // Map more than the gVisor page-cache map unit (64k) and ensure that // it is consistent with reading from the file. TEST_F(MMapFileTest, Bug38498194) { + SKIP_IF(!FSSupportsMap()); // Choose a sufficiently large map unit. constexpr int kSize = 4 * 1024 * 1024; EXPECT_THAT(ftruncate(fd_.get(), kSize), SyscallSucceeds()); @@ -1606,6 +1678,7 @@ TEST_F(MMapFileTest, Bug38498194) { // Tests that reading from a file to a memory mapping of the same file does not // deadlock. See b/34813270. TEST_F(MMapFileTest, SelfRead) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd_.get(), 0), @@ -1618,6 +1691,7 @@ TEST_F(MMapFileTest, SelfRead) { // Tests that writing to a file from a memory mapping of the same file does not // deadlock. Regression test for b/34813270. TEST_F(MMapFileTest, SelfWrite) { + SKIP_IF(!FSSupportsMap()); uintptr_t addr; ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), SyscallSucceeds()); @@ -1633,8 +1707,12 @@ TEST(MMapDeathTest, TruncateAfterCOWBreak) { auto const temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDWR)); ASSERT_THAT(ftruncate(fd.get(), kPageSize), SyscallSucceeds()); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(Mmap( - nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd.get(), 0)); + + auto maybe_mapping = Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd.get(), 0); + // Does FS support mmap? + SKIP_IF(maybe_mapping.error().errno_value() == ENODEV); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(std::move(maybe_mapping)); // Write to this mapping, causing the page to be copied for write. memset(mapping.ptr(), 'a', mapping.len()); @@ -1661,8 +1739,12 @@ TEST(MMapNoFixtureTest, MapReadOnlyAfterCreateWriteOnly) { auto const wo_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_WRONLY)); ASSERT_THAT(ftruncate(wo_fd.get(), kPageSize), SyscallSucceeds()); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, ro_fd.get(), 0)); + auto maybe_mapping = + Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, ro_fd.get(), 0); + // Does FS support mmap? + SKIP_IF(maybe_mapping.error().errno_value() == ENODEV); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(std::move(maybe_mapping)); + std::vector<char> buf(kPageSize); // The test passes if this survives. std::copy(static_cast<char*>(mapping.ptr()), diff --git a/test/syscalls/linux/msgqueue.cc b/test/syscalls/linux/msgqueue.cc new file mode 100644 index 000000000..c4761eba8 --- /dev/null +++ b/test/syscalls/linux/msgqueue.cc @@ -0,0 +1,885 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <errno.h> +#include <signal.h> +#include <sys/ipc.h> +#include <sys/msg.h> +#include <sys/types.h> + +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "test/util/capability_util.h" +#include "test/util/signal_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { +namespace { + +// Source: include/uapi/linux/msg.h +constexpr int msgMnb = 16384; // Maximum number of bytes in a queue. +constexpr int msgMni = 32000; // Max number of identifiers. +constexpr int msgPool = + (msgMni * msgMnb / 1024); // Size of buffer pool used to hold message data. +constexpr int msgMap = msgMnb; // Maximum number of entries in message map. +constexpr int msgMax = 8192; // Maximum number of bytes in a single message. +constexpr int msgSsz = 16; // Message segment size. +constexpr int msgTql = msgMnb; // Maximum number of messages on all queues. + +constexpr int kInterruptSignal = SIGALRM; + +// Queue is a RAII class used to automatically clean message queues. +class Queue { + public: + explicit Queue(int id) : id_(id) {} + Queue(const Queue&) = delete; + Queue& operator=(const Queue&) = delete; + + Queue(Queue&& other) { id_ = other.release(); } + + ~Queue() { + if (id_ >= 0) { + EXPECT_THAT(msgctl(id_, IPC_RMID, nullptr), SyscallSucceeds()); + } + } + + int release() { + int old = id_; + id_ = -1; + return old; + } + + int get() { return id_; } + + private: + int id_ = -1; +}; + +PosixErrorOr<Queue> Msgget(key_t key, int flags) { + int id = msgget(key, flags); + if (id == -1) { + return PosixError(errno, absl::StrFormat("msgget(%d, %d)", key, flags)); + } + return Queue(id); +} + +// Default size for messages. +constexpr size_t msgSize = 50; + +// msgbuf is a simple buffer using to send and receive text messages for +// testing purposes. +struct msgbuf { + int64_t mtype; + char mtext[msgSize]; +}; + +bool operator==(msgbuf& a, msgbuf& b) { + for (size_t i = 0; i < msgSize; i++) { + if (a.mtext[i] != b.mtext[i]) { + return false; + } + } + return a.mtype == b.mtype; +} + +// msgmax represents a buffer for the largest possible single message. +struct msgmax { + int64_t mtype; + char mtext[msgMax]; +}; + +// Test simple creation and retrieval for msgget(2). +TEST(MsgqueueTest, MsgGet) { + const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const key_t key = ftok(keyfile.path().c_str(), 1); + ASSERT_THAT(key, SyscallSucceeds()); + + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(key, IPC_CREAT)); + EXPECT_THAT(msgget(key, 0), SyscallSucceedsWithValue(queue.get())); +} + +// Test simple failure scenarios for msgget(2). +TEST(MsgqueueTest, MsgGetFail) { + const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const key_t key = ftok(keyfile.path().c_str(), 1); + ASSERT_THAT(key, SyscallSucceeds()); + + EXPECT_THAT(msgget(key, 0), SyscallFailsWithErrno(ENOENT)); + + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(key, IPC_CREAT)); + EXPECT_THAT(msgget(key, IPC_CREAT | IPC_EXCL), SyscallFailsWithErrno(EEXIST)); +} + +// Test using msgget(2) with IPC_PRIVATE option. +TEST(MsgqueueTest, MsgGetIpcPrivate) { + Queue queue1 = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0)); + Queue queue2 = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0)); + EXPECT_NE(queue1.get(), queue2.get()); +} + +// Test simple msgsnd and msgrcv. +TEST(MsgqueueTest, MsgOpSimple) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf buf{1, "A message."}; + msgbuf rcv; + + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext) + 1, 0, 0), + SyscallSucceedsWithValue(sizeof(buf.mtext))); + EXPECT_TRUE(buf == rcv); +} + +// Test msgsnd and msgrcv of an empty message. +TEST(MsgqueueTest, MsgOpEmpty) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf buf{1, ""}; + msgbuf rcv; + + ASSERT_THAT(msgsnd(queue.get(), &buf, 0, 0), SyscallSucceeds()); + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext) + 1, 0, 0), + SyscallSucceedsWithValue(0)); +} + +// Test truncation of message with MSG_NOERROR flag. +TEST(MsgqueueTest, MsgOpTruncate) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf buf{1, ""}; + msgbuf rcv; + + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext) - 1, 0, MSG_NOERROR), + SyscallSucceedsWithValue(sizeof(buf.mtext) - 1)); +} + +// Test msgsnd and msgrcv using invalid arguments. +TEST(MsgqueueTest, MsgOpInvalidArgs) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf buf{1, ""}; + + EXPECT_THAT(msgsnd(-1, &buf, 0, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(msgsnd(queue.get(), &buf, -1, 0), SyscallFailsWithErrno(EINVAL)); + + buf.mtype = -1; + EXPECT_THAT(msgsnd(queue.get(), &buf, 1, 0), SyscallFailsWithErrno(EINVAL)); + + EXPECT_THAT(msgrcv(-1, &buf, 1, 0, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(msgrcv(queue.get(), &buf, -1, 0, 0), + SyscallFailsWithErrno(EINVAL)); +} + +// Test non-blocking msgrcv with an empty queue. +TEST(MsgqueueTest, MsgOpNoMsg) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf rcv; + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(rcv.mtext) + 1, 0, IPC_NOWAIT), + SyscallFailsWithErrno(ENOMSG)); +} + +// Test non-blocking msgrcv with a non-empty queue, but no messages of wanted +// type. +TEST(MsgqueueTest, MsgOpNoMsgType) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf buf{1, ""}; + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + + EXPECT_THAT(msgrcv(queue.get(), &buf, sizeof(buf.mtext) + 1, 2, IPC_NOWAIT), + SyscallFailsWithErrno(ENOMSG)); +} + +// Test msgrcv with a larger size message than wanted, and truncation disabled. +TEST(MsgqueueTest, MsgOpTooBig) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf buf{1, ""}; + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + + EXPECT_THAT(msgrcv(queue.get(), &buf, sizeof(buf.mtext) - 1, 0, 0), + SyscallFailsWithErrno(E2BIG)); +} + +// Test receiving messages based on type. +TEST(MsgqueueTest, MsgRcvType) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + // Send messages in an order and receive them in reverse, based on type, + // which shouldn't block. + std::map<int64_t, msgbuf> typeToBuf = { + {1, msgbuf{1, "Message 1."}}, {2, msgbuf{2, "Message 2."}}, + {3, msgbuf{3, "Message 3."}}, {4, msgbuf{4, "Message 4."}}, + {5, msgbuf{5, "Message 5."}}, {6, msgbuf{6, "Message 6."}}, + {7, msgbuf{7, "Message 7."}}, {8, msgbuf{8, "Message 8."}}, + {9, msgbuf{9, "Message 9."}}}; + + for (auto const& [type, buf] : typeToBuf) { + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + } + + for (int64_t i = typeToBuf.size(); i > 0; i--) { + msgbuf rcv; + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(typeToBuf[i].mtext) + 1, i, 0), + SyscallSucceedsWithValue(sizeof(typeToBuf[i].mtext))); + EXPECT_TRUE(typeToBuf[i] == rcv); + } +} + +// Test using MSG_EXCEPT to receive a different-type message. +TEST(MsgqueueTest, MsgExcept) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + std::map<int64_t, msgbuf> typeToBuf = { + {1, msgbuf{1, "Message 1."}}, + {2, msgbuf{2, "Message 2."}}, + }; + + for (auto const& [type, buf] : typeToBuf) { + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + } + + for (int64_t i = typeToBuf.size(); i > 0; i--) { + msgbuf actual = typeToBuf[i == 1 ? 2 : 1]; + msgbuf rcv; + + EXPECT_THAT( + msgrcv(queue.get(), &rcv, sizeof(actual.mtext) + 1, i, MSG_EXCEPT), + SyscallSucceedsWithValue(sizeof(actual.mtext))); + EXPECT_TRUE(actual == rcv); + } +} + +// Test msgrcv with a negative type. +TEST(MsgqueueTest, MsgRcvTypeNegative) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + // When msgtyp is negative, msgrcv returns the first message with mtype less + // than or equal to the absolute value. + msgbuf buf{2, "A message."}; + msgbuf rcv; + + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + + // Nothing is less than or equal to 1. + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext) + 1, -1, IPC_NOWAIT), + SyscallFailsWithErrno(ENOMSG)); + + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext) + 1, -3, 0), + SyscallSucceedsWithValue(sizeof(buf.mtext))); + EXPECT_TRUE(buf == rcv); +} + +// Test permission-related failure scenarios. +TEST(MsgqueueTest, MsgOpPermissions) { + AutoCapability cap(CAP_IPC_OWNER, false); + + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0000)); + + msgbuf buf{1, ""}; + + EXPECT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(msgrcv(queue.get(), &buf, sizeof(buf.mtext), 0, 0), + SyscallFailsWithErrno(EACCES)); +} + +// Test limits for messages and queues. +TEST(MsgqueueTest, MsgOpLimits) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + msgbuf buf{1, "A message."}; + + // Limit for one message. + EXPECT_THAT(msgsnd(queue.get(), &buf, msgMax + 1, 0), + SyscallFailsWithErrno(EINVAL)); + + // Limit for queue. + msgmax limit{1, ""}; + for (size_t i = 0, msgCount = msgMnb / msgMax; i < msgCount; i++) { + EXPECT_THAT(msgsnd(queue.get(), &limit, sizeof(limit.mtext), 0), + SyscallSucceeds()); + } + EXPECT_THAT(msgsnd(queue.get(), &limit, sizeof(limit.mtext), IPC_NOWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// MsgCopySupported returns true if MSG_COPY is supported. +bool MsgCopySupported() { + // msgrcv(2) man page states that MSG_COPY flag is available only if the + // kernel was built with the CONFIG_CHECKPOINT_RESTORE option. If MSG_COPY + // is used when the kernel was configured without the option, msgrcv produces + // a ENOSYS error. + // To avoid test failure, we perform a small test using msgrcv, and skip the + // test if errno == ENOSYS. This means that the test will always run on + // gVisor, but may be skipped on native linux. + + auto maybe_id = Msgget(IPC_PRIVATE, 0600); + if (!maybe_id.ok()) { + return false; + } + Queue queue(std::move(maybe_id.ValueOrDie())); + msgbuf buf{1, "Test message."}; + + msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0); + return !(msgrcv(queue.get(), &buf, sizeof(buf.mtext) + 1, 0, + MSG_COPY | IPC_NOWAIT) == -1 && + errno == ENOSYS); +} + +// Test msgrcv using MSG_COPY. +TEST(MsgqueueTest, MsgCopy) { + SKIP_IF(!MsgCopySupported()); + + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + msgbuf bufs[5] = { + msgbuf{1, "Message 1."}, msgbuf{2, "Message 2."}, msgbuf{3, "Message 3."}, + msgbuf{4, "Message 4."}, msgbuf{5, "Message 5."}, + }; + + for (auto& buf : bufs) { + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + } + + // Receive a copy of the messages. + for (size_t i = 0, size = sizeof(bufs) / sizeof(bufs[0]); i < size; i++) { + msgbuf buf = bufs[i]; + msgbuf rcv; + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext) + 1, i, + MSG_COPY | IPC_NOWAIT), + SyscallSucceedsWithValue(sizeof(buf.mtext))); + EXPECT_TRUE(buf == rcv); + } + + // Re-receive the messages normally. + for (auto& buf : bufs) { + msgbuf rcv; + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext) + 1, 0, 0), + SyscallSucceedsWithValue(sizeof(buf.mtext))); + EXPECT_TRUE(buf == rcv); + } +} + +// Test msgrcv using MSG_COPY with invalid arguments. +TEST(MsgqueueTest, MsgCopyInvalidArgs) { + SKIP_IF(!MsgCopySupported()); + + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + msgbuf rcv; + EXPECT_THAT(msgrcv(queue.get(), &rcv, msgSize, 1, MSG_COPY), + SyscallFailsWithErrno(EINVAL)); + + EXPECT_THAT( + msgrcv(queue.get(), &rcv, msgSize, 5, MSG_COPY | MSG_EXCEPT | IPC_NOWAIT), + SyscallFailsWithErrno(EINVAL)); +} + +// Test msgrcv using MSG_COPY with invalid indices. +TEST(MsgqueueTest, MsgCopyInvalidIndex) { + SKIP_IF(!MsgCopySupported()); + + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + msgbuf rcv; + EXPECT_THAT(msgrcv(queue.get(), &rcv, msgSize, -3, MSG_COPY | IPC_NOWAIT), + SyscallFailsWithErrno(ENOMSG)); + + EXPECT_THAT(msgrcv(queue.get(), &rcv, msgSize, 5, MSG_COPY | IPC_NOWAIT), + SyscallFailsWithErrno(ENOMSG)); +} + +// Test msgrcv (most probably) blocking on an empty queue. +TEST(MsgqueueTest, MsgRcvBlocking) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + msgbuf buf{1, "A message."}; + + ScopedThread t([&] { + msgbuf rcv; + ASSERT_THAT( + RetryEINTR(msgrcv)(queue.get(), &rcv, sizeof(buf.mtext) + 1, 0, 0), + SyscallSucceedsWithValue(sizeof(buf.mtext))); + EXPECT_TRUE(rcv == buf); + }); + + // Sleep to try and make msgrcv block before sending a message. + absl::SleepFor(absl::Milliseconds(150)); + + EXPECT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); +} + +// Test msgrcv (most probably) waiting for a specific-type message. +TEST(MsgqueueTest, MsgRcvTypeBlocking) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + msgbuf bufs[5] = {{1, "A message."}, + {1, "A message."}, + {1, "A message."}, + {1, "A message."}, + {2, "A different message."}}; + + ScopedThread t([&] { + msgbuf buf = bufs[4]; // Buffer that should be received. + msgbuf rcv; + ASSERT_THAT( + RetryEINTR(msgrcv)(queue.get(), &rcv, sizeof(buf.mtext) + 1, 2, 0), + SyscallSucceedsWithValue(sizeof(buf.mtext))); + EXPECT_TRUE(rcv == buf); + }); + + // Sleep to try and make msgrcv block before sending messages. + absl::SleepFor(absl::Milliseconds(150)); + + // Send all buffers in order, only last one should be received. + for (auto& buf : bufs) { + EXPECT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + } +} + +// Test msgsnd (most probably) blocking on a full queue. +TEST(MsgqueueTest, MsgSndBlocking) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + msgmax buf{1, ""}; // Has max amount of bytes. + + const size_t msgCount = msgMnb / msgMax; // Number of messages that can be + // sent without blocking. + + ScopedThread t([&] { + // Fill the queue. + for (size_t i = 0; i < msgCount; i++) { + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + } + + // Next msgsnd should block. + ASSERT_THAT(RetryEINTR(msgsnd)(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + }); + + const DisableSave ds; // Too many syscalls. + + // To increase the chance of the last msgsnd blocking before doing a msgrcv, + // we use MSG_COPY option to copy the last index in the queue. As long as + // MSG_COPY fails, the queue hasn't yet been filled. When MSG_COPY succeeds, + // the queue is filled, and most probably, a blocking msgsnd has been made. + msgmax rcv; + while (msgrcv(queue.get(), &rcv, msgMax, msgCount - 1, + MSG_COPY | IPC_NOWAIT) == -1 && + errno == ENOMSG) { + } + + // Delay a bit more for the blocking msgsnd. + absl::SleepFor(absl::Milliseconds(100)); + + EXPECT_THAT(msgrcv(queue.get(), &rcv, sizeof(buf.mtext), 0, 0), + SyscallSucceedsWithValue(sizeof(buf.mtext))); +} + +// Test removing a queue while a blocking msgsnd is executing. +TEST(MsgqueueTest, MsgSndRmWhileBlocking) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + // Number of messages that can be sent without blocking. + const size_t msgCount = msgMnb / msgMax; + + ScopedThread t([&] { + // Fill the queue. + msgmax buf{1, ""}; + for (size_t i = 0; i < msgCount; i++) { + EXPECT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + } + + // Next msgsnd should block. Because we're repeating on EINTR, msgsnd may + // race with msgctl(IPC_RMID) and return EINVAL. + EXPECT_THAT(RetryEINTR(msgsnd)(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallFails()); + EXPECT_TRUE((errno == EIDRM || errno == EINVAL)); + }); + + const DisableSave ds; // Too many syscalls. + + // Similar to MsgSndBlocking, we do this to increase the chance of msgsnd + // blocking before removing the queue. + msgmax rcv; + while (msgrcv(queue.get(), &rcv, msgMax, msgCount - 1, + MSG_COPY | IPC_NOWAIT) == -1 && + errno == ENOMSG) { + } + absl::SleepFor(absl::Milliseconds(100)); + + EXPECT_THAT(msgctl(queue.release(), IPC_RMID, nullptr), SyscallSucceeds()); +} + +// Test removing a queue while a blocking msgrcv is executing. +TEST(MsgqueueTest, MsgRcvRmWhileBlocking) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + ScopedThread t([&] { + // Because we're repeating on EINTR, msgsnd may race with msgctl(IPC_RMID) + // and return EINVAL. + msgbuf rcv; + EXPECT_THAT(RetryEINTR(msgrcv)(queue.get(), &rcv, 1, 2, 0), SyscallFails()); + EXPECT_TRUE(errno == EIDRM || errno == EINVAL); + }); + + // Sleep to try and make msgrcv block before sending messages. + absl::SleepFor(absl::Milliseconds(150)); + + EXPECT_THAT(msgctl(queue.release(), IPC_RMID, nullptr), SyscallSucceeds()); +} + +// Test a collection of msgsnd/msgrcv operations in different processes. +TEST(MsgqueueTest, MsgOpGeneral) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + + // Create multiple sending/receiving threads that send messages back and + // forth. There's a matching recv for each send, so by the end of the test, + // all threads should succeed and return. + const std::vector<msgbuf> msgs = { + msgbuf{1, "Message 1."}, msgbuf{2, "Message 2."}, msgbuf{3, "Message 3."}, + msgbuf{4, "Message 4."}, msgbuf{5, "Message 5."}}; + + auto receiver = [&](int i) { + return [i, &msgs, &queue]() { + const msgbuf& target = msgs[i]; + msgbuf rcv; + EXPECT_THAT(RetryEINTR(msgrcv)(queue.get(), &rcv, + sizeof(target.mtext) + 1, target.mtype, 0), + SyscallSucceedsWithValue(sizeof(target.mtext))); + EXPECT_EQ(rcv.mtype, target.mtype); + EXPECT_EQ(0, memcmp(rcv.mtext, target.mtext, sizeof(target.mtext))); + }; + }; + + ScopedThread r1(receiver(0)); + ScopedThread r2(receiver(1)); + ScopedThread r3(receiver(2)); + ScopedThread r4(receiver(3)); + ScopedThread r5(receiver(4)); + ScopedThread r6(receiver(0)); + ScopedThread r7(receiver(1)); + ScopedThread r8(receiver(2)); + ScopedThread r9(receiver(3)); + ScopedThread r10(receiver(4)); + + auto sender = [&](int i) { + return [i, &msgs, &queue]() { + const msgbuf& target = msgs[i]; + EXPECT_THAT( + RetryEINTR(msgsnd)(queue.get(), &target, sizeof(target.mtext), 0), + SyscallSucceeds()); + }; + }; + + ScopedThread s1(sender(0)); + ScopedThread s2(sender(1)); + ScopedThread s3(sender(2)); + ScopedThread s4(sender(3)); + ScopedThread s5(sender(4)); + ScopedThread s6(sender(0)); + ScopedThread s7(sender(1)); + ScopedThread s8(sender(2)); + ScopedThread s9(sender(3)); + ScopedThread s10(sender(4)); +} + +void empty_sighandler(int sig, siginfo_t* info, void* context) {} + +TEST(MsgqueueTest, InterruptRecv) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + char buf[64]; + + absl::Notification done, exit; + + // Thread calling msgrcv with no corresponding send. It would block forever, + // but we'll interrupt with a signal below. + ScopedThread t([&] { + struct sigaction sa = {}; + sa.sa_sigaction = empty_sighandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + auto cleanup_sigaction = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kInterruptSignal, sa)); + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ScopedSignalMask(SIG_UNBLOCK, kInterruptSignal)); + + EXPECT_THAT(msgrcv(queue.get(), &buf, sizeof(buf), 0, 0), + SyscallFailsWithErrno(EINTR)); + + done.Notify(); + exit.WaitForNotification(); + }); + + const DisableSave ds; // Too many syscalls. + + // We want the signal to arrive while msgrcv is blocking, but not after the + // thread has exited. Signals that arrive before msgrcv are no-ops. + do { + EXPECT_THAT(kill(getpid(), kInterruptSignal), SyscallSucceeds()); + absl::SleepFor(absl::Milliseconds(100)); // Rate limit. + } while (!done.HasBeenNotified()); + + exit.Notify(); + t.Join(); +} + +TEST(MsgqueueTest, InterruptSend) { + Queue queue = ASSERT_NO_ERRNO_AND_VALUE(Msgget(IPC_PRIVATE, 0600)); + msgmax buf{1, ""}; + // Number of messages that can be sent without blocking. + const size_t msgCount = msgMnb / msgMax; + + // Fill the queue. + for (size_t i = 0; i < msgCount; i++) { + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + } + + absl::Notification done, exit; + + // Thread calling msgsnd on a full queue. It would block forever, but we'll + // interrupt with a signal below. + ScopedThread t([&] { + struct sigaction sa = {}; + sa.sa_sigaction = empty_sighandler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + auto cleanup_sigaction = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kInterruptSignal, sa)); + auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ScopedSignalMask(SIG_UNBLOCK, kInterruptSignal)); + + EXPECT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallFailsWithErrno(EINTR)); + + done.Notify(); + exit.WaitForNotification(); + }); + + const DisableSave ds; // Too many syscalls. + + // We want the signal to arrive while msgsnd is blocking, but not after the + // thread has exited. Signals that arrive before msgsnd are no-ops. + do { + EXPECT_THAT(kill(getpid(), kInterruptSignal), SyscallSucceeds()); + absl::SleepFor(absl::Milliseconds(100)); // Rate limit. + } while (!done.HasBeenNotified()); + + exit.Notify(); + t.Join(); +} + +// Test msgctl with IPC_STAT option. +TEST(MsgqueueTest, MsgCtlIpcStat) { + auto start = absl::Now(); + + Queue queue(msgget(IPC_PRIVATE, 0600)); + ASSERT_THAT(queue.get(), SyscallSucceeds()); + + const uid_t uid = getuid(); + const gid_t gid = getgid(); + const pid_t pid = getpid(); + + struct msqid_ds ds; + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), SyscallSucceeds()); + + EXPECT_EQ(ds.msg_perm.__key, IPC_PRIVATE); + EXPECT_EQ(ds.msg_perm.uid, uid); + EXPECT_EQ(ds.msg_perm.gid, gid); + EXPECT_EQ(ds.msg_perm.cuid, uid); + EXPECT_EQ(ds.msg_perm.cgid, gid); + EXPECT_EQ(ds.msg_perm.mode, 0600); + + EXPECT_EQ(ds.msg_stime, 0); + EXPECT_EQ(ds.msg_rtime, 0); + EXPECT_GE(ds.msg_ctime, absl::ToTimeT(start)); + + EXPECT_EQ(ds.msg_cbytes, 0); + EXPECT_EQ(ds.msg_qnum, 0); + EXPECT_EQ(ds.msg_qbytes, msgMnb); + EXPECT_EQ(ds.msg_lspid, 0); + EXPECT_EQ(ds.msg_lrpid, 0); + + // The timestamps only have a resolution of seconds; slow down so we actually + // see the timestamps change. + absl::SleepFor(absl::Seconds(1)); + auto pre_send = absl::Now(); + + msgbuf buf{1, "A message."}; + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), SyscallSucceeds()); + + EXPECT_GE(ds.msg_stime, absl::ToTimeT(pre_send)); + EXPECT_EQ(ds.msg_rtime, 0); + EXPECT_GE(ds.msg_ctime, absl::ToTimeT(start)); + + EXPECT_EQ(ds.msg_cbytes, msgSize); + EXPECT_EQ(ds.msg_qnum, 1); + EXPECT_EQ(ds.msg_qbytes, msgMnb); + EXPECT_EQ(ds.msg_lspid, pid); + EXPECT_EQ(ds.msg_lrpid, 0); + + absl::SleepFor(absl::Seconds(1)); + auto pre_receive = absl::Now(); + + ASSERT_THAT(msgrcv(queue.get(), &buf, sizeof(buf.mtext), 0, 0), + SyscallSucceedsWithValue(msgSize)); + + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), SyscallSucceeds()); + + EXPECT_GE(ds.msg_stime, absl::ToTimeT(pre_send)); + EXPECT_GE(ds.msg_rtime, absl::ToTimeT(pre_receive)); + EXPECT_GE(ds.msg_ctime, absl::ToTimeT(start)); + + EXPECT_EQ(ds.msg_cbytes, 0); + EXPECT_EQ(ds.msg_qnum, 0); + EXPECT_EQ(ds.msg_qbytes, msgMnb); + EXPECT_EQ(ds.msg_lspid, pid); + EXPECT_EQ(ds.msg_lrpid, pid); +} + +// Test msgctl with IPC_STAT option on a write-only queue. +TEST(MsgqueueTest, MsgCtlIpcStatWriteOnly) { + // Drop CAP_IPC_OWNER which allows us to bypass permissions. + AutoCapability cap(CAP_IPC_OWNER, false); + + Queue queue(msgget(IPC_PRIVATE, 0200)); + ASSERT_THAT(queue.get(), SyscallSucceeds()); + + struct msqid_ds ds; + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), + SyscallFailsWithErrno(EACCES)); +} + +// Test msgctl with IPC_SET option. +TEST(MsgqueueTest, MsgCtlIpcSet) { + Queue queue(msgget(IPC_PRIVATE, 0600)); + ASSERT_THAT(queue.get(), SyscallSucceeds()); + + struct msqid_ds ds; + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), SyscallSucceeds()); + EXPECT_EQ(ds.msg_perm.mode, 0600); + + ds.msg_perm.mode = 0777; + ASSERT_THAT(msgctl(queue.get(), IPC_SET, &ds), SyscallSucceeds()); + + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), SyscallSucceeds()); + EXPECT_EQ(ds.msg_perm.mode, 0777); +} + +// Test increasing msg_qbytes beyond limit with IPC_SET. +TEST(MsgqueueTest, MsgCtlIpcSetMaxBytes) { + // Drop CAP_SYS_RESOURCE which allows us to increase msg_qbytes beyond the + // system parameter MSGMNB. + AutoCapability cap(CAP_SYS_RESOURCE, false); + + Queue queue(msgget(IPC_PRIVATE, 0600)); + ASSERT_THAT(queue.get(), SyscallSucceeds()); + + struct msqid_ds ds; + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), SyscallSucceeds()); + EXPECT_EQ(ds.msg_qbytes, msgMnb); + + ds.msg_qbytes = msgMnb - 10; + ASSERT_THAT(msgctl(queue.get(), IPC_SET, &ds), SyscallSucceeds()); + + ASSERT_THAT(msgctl(queue.get(), IPC_STAT, &ds), SyscallSucceeds()); + EXPECT_EQ(ds.msg_qbytes, msgMnb - 10); + + ds.msg_qbytes = msgMnb + 10; + EXPECT_THAT(msgctl(queue.get(), IPC_SET, &ds), SyscallFailsWithErrno(EPERM)); +} + +// Test msgctl with IPC_INFO option. +TEST(MsgqueueTest, MsgCtlIpcInfo) { + struct msginfo info; + ASSERT_THAT(msgctl(0, IPC_INFO, reinterpret_cast<struct msqid_ds*>(&info)), + SyscallSucceeds()); + + EXPECT_GT(info.msgmax, 0); + EXPECT_GT(info.msgmni, 0); + EXPECT_GT(info.msgmnb, 0); + EXPECT_EQ(info.msgpool, msgPool); + EXPECT_EQ(info.msgmap, msgMap); + EXPECT_EQ(info.msgssz, msgSsz); + EXPECT_EQ(info.msgtql, msgTql); +} + +// Test msgctl with MSG_INFO option. +TEST(MsgqueueTest, MsgCtlMsgInfo) { + struct msginfo info; + ASSERT_THAT(msgctl(0, MSG_INFO, reinterpret_cast<struct msqid_ds*>(&info)), + SyscallSucceeds()); + + EXPECT_GT(info.msgmax, 0); + EXPECT_GT(info.msgmni, 0); + EXPECT_GT(info.msgmnb, 0); + EXPECT_EQ(info.msgpool, 0); // Number of queues in the system. + EXPECT_EQ(info.msgmap, 0); // Total number of messages in all queues. + EXPECT_EQ(info.msgtql, 0); // Total number of bytes in all messages. + EXPECT_EQ(info.msgssz, msgSsz); + + // Add a queue and a message. + Queue queue(msgget(IPC_PRIVATE, 0600)); + ASSERT_THAT(queue.get(), SyscallSucceeds()); + + msgbuf buf{1, "A message."}; + ASSERT_THAT(msgsnd(queue.get(), &buf, sizeof(buf.mtext), 0), + SyscallSucceeds()); + + ASSERT_THAT(msgctl(0, MSG_INFO, reinterpret_cast<struct msqid_ds*>(&info)), + SyscallSucceeds()); + + EXPECT_GT(info.msgmax, 0); + EXPECT_GT(info.msgmni, 0); + EXPECT_GT(info.msgmnb, 0); + EXPECT_EQ(info.msgpool, 1); // Number of queues in the system. + EXPECT_EQ(info.msgmap, 1); // Total number of messages in all queues. + EXPECT_EQ(info.msgtql, msgSize); // Total number of bytes in all messages. + EXPECT_EQ(info.msgssz, msgSsz); +} + +} // namespace +} // namespace testing +} // namespace gvisor + +int main(int argc, char** argv) { + // Some tests depend on delivering a signal to the main thread. Block the + // target signal so that any other threads created by TestInit will also have + // the signal blocked. + sigset_t set; + sigemptyset(&set); + sigaddset(&set, gvisor::testing::kInterruptSignal); + TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0); + + gvisor::testing::TestInit(&argc, &argv); + return gvisor::testing::RunAllTests(); +} diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc index 133fdecf0..aa9e1bc04 100644 --- a/test/syscalls/linux/network_namespace.cc +++ b/test/syscalls/linux/network_namespace.cc @@ -12,18 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <net/if.h> -#include <sched.h> -#include <sys/ioctl.h> -#include <sys/socket.h> -#include <sys/types.h> - -#include "gmock/gmock.h" #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/ip_socket_test_util.h" #include "test/util/capability_util.h" -#include "test/util/posix_error.h" -#include "test/util/test_util.h" #include "test/util/thread_util.h" namespace gvisor { @@ -37,13 +28,7 @@ TEST(NetworkNamespaceTest, LoopbackExists) { ASSERT_THAT(unshare(CLONE_NEWNET), SyscallSucceedsWithValue(0)); // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists. - // Check loopback device exists. - int sock = socket(AF_INET, SOCK_DGRAM, 0); - ASSERT_THAT(sock, SyscallSucceeds()); - struct ifreq ifr; - strncpy(ifr.ifr_name, "lo", IFNAMSIZ); - EXPECT_THAT(ioctl(sock, SIOCGIFINDEX, &ifr), SyscallSucceeds()) - << "lo cannot be found"; + ASSERT_NE(ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()), 0); }); } diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc index 861617ff7..43828a52e 100644 --- a/test/syscalls/linux/packet_socket.cc +++ b/test/syscalls/linux/packet_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,50 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <arpa/inet.h> -#include <ifaddrs.h> -#include <linux/capability.h> -#include <linux/if_arp.h> -#include <linux/if_packet.h> -#include <net/ethernet.h> -#include <netinet/in.h> -#include <netinet/ip.h> -#include <netinet/udp.h> +#include <net/if.h> +#include <netinet/if_ether.h> +#include <netpacket/packet.h> #include <poll.h> -#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/types.h> -#include <unistd.h> + +#include <limits> #include "gtest/gtest.h" -#include "absl/base/internal/endian.h" -#include "test/syscalls/linux/socket_test_util.h" -#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/syscalls/linux/ip_socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" -#include "test/util/test_util.h" - -// Some of these tests involve sending packets via AF_PACKET sockets and the -// loopback interface. Because AF_PACKET circumvents so much of the networking -// stack, Linux sees these packets as "martian", i.e. they claim to be to/from -// localhost but don't have the usual associated data. Thus Linux drops them by -// default. You can see where this happens by following the code at: -// -// - net/ipv4/ip_input.c:ip_rcv_finish, which calls -// - net/ipv4/route.c:ip_route_input_noref, which calls -// - net/ipv4/route.c:ip_route_input_slow, which finds and drops martian -// packets. -// -// To tell Linux not to drop these packets, you need to tell it to accept our -// funny packets (which are completely valid and correct, but lack associated -// in-kernel data because we use AF_PACKET): -// -// echo 1 >> /proc/sys/net/ipv4/conf/lo/accept_local -// echo 1 >> /proc/sys/net/ipv4/conf/lo/route_localnet -// -// These tests require CAP_NET_RAW to run. - -// TODO(gvisor.dev/issue/173): gVisor support. +#include "test/util/socket_util.h" namespace gvisor { namespace testing { @@ -63,492 +33,191 @@ namespace testing { namespace { using ::testing::AnyOf; +using ::testing::Combine; using ::testing::Eq; +using ::testing::Values; -constexpr char kMessage[] = "soweoneul malhaebwa"; -constexpr in_port_t kPort = 0x409c; // htons(40000) - -// -// "Cooked" tests. Cooked AF_PACKET sockets do not contain link layer -// headers, and provide link layer destination/source information via a -// returned struct sockaddr_ll. -// - -// Send kMessage via sock to loopback -void SendUDPMessage(int sock) { - struct sockaddr_in dest = {}; - dest.sin_port = kPort; - dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - dest.sin_family = AF_INET; - EXPECT_THAT(sendto(sock, kMessage, sizeof(kMessage), 0, - reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), - SyscallSucceedsWithValue(sizeof(kMessage))); -} - -// Send an IP packet and make sure ETH_P_<something else> doesn't pick it up. -TEST(BasicCookedPacketTest, WrongType) { - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { - ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP), - SyscallFailsWithErrno(EPERM)); - GTEST_SKIP(); - } - - FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE( - Socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_PUP))); - - // Let's use a simple IP payload: a UDP datagram. - FileDescriptor udp_sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); - SendUDPMessage(udp_sock.get()); - - // Wait and make sure the socket never becomes readable. - struct pollfd pfd = {}; - pfd.fd = sock.get(); - pfd.events = POLLIN; - EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0)); -} - -// Tests for "cooked" (SOCK_DGRAM) packet(7) sockets. -class CookedPacketTest : public ::testing::TestWithParam<int> { +class PacketSocketCreationTest + : public ::testing::TestWithParam<std::tuple<int, int>> { protected: - // Creates a socket to be used in tests. - void SetUp() override; - - // Closes the socket created by SetUp(). - void TearDown() override; - - // Gets the device index of the loopback device. - int GetLoopbackIndex(); - - // The socket used for both reading and writing. - int socket_; -}; - -void CookedPacketTest::SetUp() { - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { - ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())), - SyscallFailsWithErrno(EPERM)); - GTEST_SKIP(); - } - - if (!IsRunningOnGvisor()) { - FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE( - Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDONLY)); - FileDescriptor routeLocalnet = ASSERT_NO_ERRNO_AND_VALUE( - Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDONLY)); - char enabled; - ASSERT_THAT(read(acceptLocal.get(), &enabled, 1), - SyscallSucceedsWithValue(1)); - ASSERT_EQ(enabled, '1'); - ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1), - SyscallSucceedsWithValue(1)); - ASSERT_EQ(enabled, '1'); - } - - ASSERT_THAT(socket_ = socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())), - SyscallSucceeds()); -} - -void CookedPacketTest::TearDown() { - // TearDown will be run even if we skip the test. - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { - EXPECT_THAT(close(socket_), SyscallSucceeds()); - } -} - -int CookedPacketTest::GetLoopbackIndex() { - struct ifreq ifr; - snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); - EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds()); - EXPECT_NE(ifr.ifr_ifindex, 0); - return ifr.ifr_ifindex; -} - -// Receive and verify the message via packet socket on interface. -void ReceiveMessage(int sock, int ifindex) { - // Wait for the socket to become readable. - struct pollfd pfd = {}; - pfd.fd = sock; - pfd.events = POLLIN; - EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1)); - - // Read and verify the data. - constexpr size_t packet_size = - sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage); - char buf[64]; - struct sockaddr_ll src = {}; - socklen_t src_len = sizeof(src); - ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0, - reinterpret_cast<struct sockaddr*>(&src), &src_len), - SyscallSucceedsWithValue(packet_size)); - - // sockaddr_ll ends with an 8 byte physical address field, but ethernet - // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 - // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns - // sizeof(sockaddr_ll). - ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); - - // TODO(gvisor.dev/issue/173): Verify protocol once we return it. - // Verify the source address. - EXPECT_EQ(src.sll_family, AF_PACKET); - EXPECT_EQ(src.sll_ifindex, ifindex); - EXPECT_EQ(src.sll_halen, ETH_ALEN); - EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); - // This came from the loopback device, so the address is all 0s. - for (int i = 0; i < src.sll_halen; i++) { - EXPECT_EQ(src.sll_addr[i], 0); + void SetUp() override { + if (!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())) { + const auto [type, protocol] = GetParam(); + ASSERT_THAT(socket(AF_PACKET, type, htons(protocol)), + SyscallFailsWithErrno(EPERM)); + GTEST_SKIP() << "Missing packet socket capability"; + } } +}; - // Verify the IP header. We memcpy to deal with pointer aligment. - struct iphdr ip = {}; - memcpy(&ip, buf, sizeof(ip)); - EXPECT_EQ(ip.ihl, 5); - EXPECT_EQ(ip.version, 4); - EXPECT_EQ(ip.tot_len, htons(packet_size)); - EXPECT_EQ(ip.protocol, IPPROTO_UDP); - EXPECT_EQ(ip.daddr, htonl(INADDR_LOOPBACK)); - EXPECT_EQ(ip.saddr, htonl(INADDR_LOOPBACK)); - - // Verify the UDP header. We memcpy to deal with pointer aligment. - struct udphdr udp = {}; - memcpy(&udp, buf + sizeof(iphdr), sizeof(udp)); - EXPECT_EQ(udp.dest, kPort); - EXPECT_EQ(udp.len, htons(sizeof(udphdr) + sizeof(kMessage))); - - // Verify the payload. - char* payload = reinterpret_cast<char*>(buf + sizeof(iphdr) + sizeof(udphdr)); - EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0); -} - -// Receive via a packet socket. -TEST_P(CookedPacketTest, Receive) { - // Let's use a simple IP payload: a UDP datagram. - FileDescriptor udp_sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); - SendUDPMessage(udp_sock.get()); - - // Receive and verify the data. - int loopback_index = GetLoopbackIndex(); - ReceiveMessage(socket_, loopback_index); -} - -// Send via a packet socket. -TEST_P(CookedPacketTest, Send) { - // TODO(gvisor.dev/issue/173): Remove once we support packet socket writing. - SKIP_IF(IsRunningOnGvisor()); - - // Let's send a UDP packet and receive it using a regular UDP socket. - FileDescriptor udp_sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); - struct sockaddr_in bind_addr = {}; - bind_addr.sin_family = AF_INET; - bind_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - bind_addr.sin_port = kPort; - ASSERT_THAT( - bind(udp_sock.get(), reinterpret_cast<struct sockaddr*>(&bind_addr), - sizeof(bind_addr)), - SyscallSucceeds()); - - // Set up the destination physical address. - struct sockaddr_ll dest = {}; - dest.sll_family = AF_PACKET; - dest.sll_halen = ETH_ALEN; - dest.sll_ifindex = GetLoopbackIndex(); - dest.sll_protocol = htons(ETH_P_IP); - // We're sending to the loopback device, so the address is all 0s. - memset(dest.sll_addr, 0x00, ETH_ALEN); - - // Set up the IP header. - struct iphdr iphdr = {0}; - iphdr.ihl = 5; - iphdr.version = 4; - iphdr.tos = 0; - iphdr.tot_len = - htons(sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage)); - // Get a pseudo-random ID. If we clash with an in-use ID the test will fail, - // but we have no way of getting an ID we know to be good. - srand(*reinterpret_cast<unsigned int*>(&iphdr)); - iphdr.id = rand(); - // Linux sets this bit ("do not fragment") for small packets. - iphdr.frag_off = 1 << 6; - iphdr.ttl = 64; - iphdr.protocol = IPPROTO_UDP; - iphdr.daddr = htonl(INADDR_LOOPBACK); - iphdr.saddr = htonl(INADDR_LOOPBACK); - iphdr.check = IPChecksum(iphdr); - - // Set up the UDP header. - struct udphdr udphdr = {}; - udphdr.source = kPort; - udphdr.dest = kPort; - udphdr.len = htons(sizeof(udphdr) + sizeof(kMessage)); - udphdr.check = UDPChecksum(iphdr, udphdr, kMessage, sizeof(kMessage)); - - // Copy both headers and the payload into our packet buffer. - char send_buf[sizeof(iphdr) + sizeof(udphdr) + sizeof(kMessage)]; - memcpy(send_buf, &iphdr, sizeof(iphdr)); - memcpy(send_buf + sizeof(iphdr), &udphdr, sizeof(udphdr)); - memcpy(send_buf + sizeof(iphdr) + sizeof(udphdr), kMessage, sizeof(kMessage)); - - // Send it. - ASSERT_THAT(sendto(socket_, send_buf, sizeof(send_buf), 0, - reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), - SyscallSucceedsWithValue(sizeof(send_buf))); - - // Wait for the packet to become available on both sockets. - struct pollfd pfd = {}; - pfd.fd = udp_sock.get(); - pfd.events = POLLIN; - ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1)); - pfd.fd = socket_; - pfd.events = POLLIN; - ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1)); - - // Receive on the packet socket. - char recv_buf[sizeof(send_buf)]; - ASSERT_THAT(recv(socket_, recv_buf, sizeof(recv_buf), 0), - SyscallSucceedsWithValue(sizeof(recv_buf))); - ASSERT_EQ(memcmp(recv_buf, send_buf, sizeof(send_buf)), 0); - - // Receive on the UDP socket. - struct sockaddr_in src; - socklen_t src_len = sizeof(src); - ASSERT_THAT(recvfrom(udp_sock.get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT, - reinterpret_cast<struct sockaddr*>(&src), &src_len), - SyscallSucceedsWithValue(sizeof(kMessage))); - // Check src and payload. - EXPECT_EQ(strncmp(recv_buf, kMessage, sizeof(kMessage)), 0); - EXPECT_EQ(src.sin_family, AF_INET); - EXPECT_EQ(src.sin_port, kPort); - EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK)); -} - -// Bind and receive via packet socket. -TEST_P(CookedPacketTest, BindReceive) { - struct sockaddr_ll bind_addr = {}; - bind_addr.sll_family = AF_PACKET; - bind_addr.sll_protocol = htons(GetParam()); - bind_addr.sll_ifindex = GetLoopbackIndex(); - - ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), - sizeof(bind_addr)), - SyscallSucceeds()); - - // Let's use a simple IP payload: a UDP datagram. - FileDescriptor udp_sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); - SendUDPMessage(udp_sock.get()); - - // Receive and verify the data. - ReceiveMessage(socket_, bind_addr.sll_ifindex); -} - -// Double Bind socket. -TEST_P(CookedPacketTest, DoubleBindSucceeds) { - struct sockaddr_ll bind_addr = {}; - bind_addr.sll_family = AF_PACKET; - bind_addr.sll_protocol = htons(GetParam()); - bind_addr.sll_ifindex = GetLoopbackIndex(); - - ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), - sizeof(bind_addr)), - SyscallSucceeds()); - - // Binding socket again should fail. - ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), - sizeof(bind_addr)), - // Linux 4.09 returns EINVAL here, but some time before 4.19 it - // switched to EADDRINUSE. - SyscallSucceeds()); +TEST_P(PacketSocketCreationTest, Create) { + const auto [type, protocol] = GetParam(); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, type, htons(protocol))); + EXPECT_GE(fd.get(), 0); } -// Bind and verify we do not receive data on interface which is not bound -TEST_P(CookedPacketTest, BindDrop) { - // Let's use a simple IP payload: a UDP datagram. - FileDescriptor udp_sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); - - struct ifaddrs* if_addr_list = nullptr; - auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); }); - - ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds()); +INSTANTIATE_TEST_SUITE_P(AllPacketSocketTests, PacketSocketCreationTest, + Combine(Values(SOCK_DGRAM, SOCK_RAW), + Values(0, 1, 255, ETH_P_IP, ETH_P_IPV6, + std::numeric_limits<uint16_t>::max()))); - // Get interface other than loopback. - struct ifreq ifr = {}; - for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) { - if (strcmp(i->ifa_name, "lo") != 0) { - strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name)); - break; +class PacketSocketTest : public ::testing::TestWithParam<int> { + protected: + void SetUp() override { + if (!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())) { + ASSERT_THAT(socket(AF_PACKET, GetParam(), 0), + SyscallFailsWithErrno(EPERM)); + GTEST_SKIP() << "Missing packet socket capability"; } - } - // Skip if no interface is available other than loopback. - if (strlen(ifr.ifr_name) == 0) { - GTEST_SKIP(); + socket_ = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, GetParam(), 0)); } - // Get interface index. - EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds()); - EXPECT_NE(ifr.ifr_ifindex, 0); - - // Bind to packet socket requires only family, protocol and ifindex. - struct sockaddr_ll bind_addr = {}; - bind_addr.sll_family = AF_PACKET; - bind_addr.sll_protocol = htons(GetParam()); - bind_addr.sll_ifindex = ifr.ifr_ifindex; - - ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), - sizeof(bind_addr)), - SyscallSucceeds()); - - // Send to loopback interface. - struct sockaddr_in dest = {}; - dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - dest.sin_family = AF_INET; - dest.sin_port = kPort; - EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0, - reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), - SyscallSucceedsWithValue(sizeof(kMessage))); + FileDescriptor socket_; +}; - // Wait and make sure the socket never receives any data. - struct pollfd pfd = {}; - pfd.fd = socket_; - pfd.events = POLLIN; - EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0)); -} +TEST_P(PacketSocketTest, RebindProtocol) { + const bool kEthHdrIncluded = GetParam() == SOCK_RAW; -// Verify that we receive outbound packets. This test requires at least one -// non loopback interface so that we can actually capture an outgoing packet. -TEST_P(CookedPacketTest, ReceiveOutbound) { - // Only ETH_P_ALL sockets can receive outbound packets on linux. - SKIP_IF(GetParam() != ETH_P_ALL); + sockaddr_in udp_bind_addr = { + .sin_family = AF_INET, + .sin_addr = {.s_addr = htonl(INADDR_LOOPBACK)}, + }; - // Let's use a simple IP payload: a UDP datagram. FileDescriptor udp_sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); - - struct ifaddrs* if_addr_list = nullptr; - auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); }); - - ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds()); - - // Get interface other than loopback. - struct ifreq ifr = {}; - for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) { - if (strcmp(i->ifa_name, "lo") != 0) { - strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name)); - break; - } - } - - // Skip if no interface is available other than loopback. - if (strlen(ifr.ifr_name) == 0) { - GTEST_SKIP(); - } - - // Get interface index and name. - EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds()); - EXPECT_NE(ifr.ifr_ifindex, 0); - int ifindex = ifr.ifr_ifindex; - - constexpr int kMACSize = 6; - char hwaddr[kMACSize]; - // Get interface address. - ASSERT_THAT(ioctl(socket_, SIOCGIFHWADDR, &ifr), SyscallSucceeds()); - ASSERT_THAT(ifr.ifr_hwaddr.sa_family, - AnyOf(Eq(ARPHRD_NONE), Eq(ARPHRD_ETHER))); - memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, kMACSize); - - // Just send it to the google dns server 8.8.8.8. It's UDP we don't care - // if it actually gets to the DNS Server we just want to see that we receive - // it on our AF_PACKET socket. - // - // NOTE: We just want to pick an IP that is non-local to avoid having to - // handle ARP as this should cause the UDP packet to be sent to the default - // gateway configured for the system under test. Otherwise the only packet we - // will see is the ARP query unless we picked an IP which will actually - // resolve. The test is a bit brittle but this was the best compromise for - // now. - struct sockaddr_in dest = {}; - ASSERT_EQ(inet_pton(AF_INET, "8.8.8.8", &dest.sin_addr.s_addr), 1); - dest.sin_family = AF_INET; - dest.sin_port = kPort; - EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0, - reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), - SyscallSucceedsWithValue(sizeof(kMessage))); - - // Wait and make sure the socket receives the data. - struct pollfd pfd = {}; - pfd.fd = socket_; - pfd.events = POLLIN; - EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(1)); - - // Now read and check that the packet is the one we just sent. - // Read and verify the data. - constexpr size_t packet_size = - sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage); - char buf[64]; - struct sockaddr_ll src = {}; - socklen_t src_len = sizeof(src); - ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0, - reinterpret_cast<struct sockaddr*>(&src), &src_len), - SyscallSucceedsWithValue(packet_size)); - - // sockaddr_ll ends with an 8 byte physical address field, but ethernet - // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 - // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns - // sizeof(sockaddr_ll). - ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); - - // Verify the source address. - EXPECT_EQ(src.sll_family, AF_PACKET); - EXPECT_EQ(src.sll_ifindex, ifindex); - EXPECT_EQ(src.sll_halen, ETH_ALEN); - EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); - EXPECT_EQ(src.sll_pkttype, PACKET_OUTGOING); - // Verify the link address of the interface matches that of the non - // non loopback interface address we stored above. - for (int i = 0; i < src.sll_halen; i++) { - EXPECT_EQ(src.sll_addr[i], hwaddr[i]); + { + // Bind the socket so that we have something to send packets to. + // + // If we didn't do this, the UDP packets we send will be responded to with + // ICMP Destination Port Unreachable errors. + ASSERT_THAT( + bind(udp_sock.get(), reinterpret_cast<const sockaddr*>(&udp_bind_addr), + sizeof(udp_bind_addr)), + SyscallSucceeds()); + socklen_t addrlen = sizeof(udp_bind_addr); + ASSERT_THAT( + getsockname(udp_sock.get(), reinterpret_cast<sockaddr*>(&udp_bind_addr), + &addrlen), + SyscallSucceeds()); + ASSERT_THAT(addrlen, sizeof(udp_bind_addr)); } - // Verify the IP header. - struct iphdr ip = {}; - memcpy(&ip, buf, sizeof(ip)); - EXPECT_EQ(ip.ihl, 5); - EXPECT_EQ(ip.version, 4); - EXPECT_EQ(ip.tot_len, htons(packet_size)); - EXPECT_EQ(ip.protocol, IPPROTO_UDP); - EXPECT_EQ(ip.daddr, dest.sin_addr.s_addr); - EXPECT_NE(ip.saddr, htonl(INADDR_LOOPBACK)); - - // Verify the UDP header. - struct udphdr udp = {}; - memcpy(&udp, buf + sizeof(iphdr), sizeof(udp)); - EXPECT_EQ(udp.dest, kPort); - EXPECT_EQ(udp.len, htons(sizeof(udphdr) + sizeof(kMessage))); - - // Verify the payload. - char* payload = reinterpret_cast<char*>(buf + sizeof(iphdr) + sizeof(udphdr)); - EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0); -} + const int loopback_index = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); + + auto send_udp_message = [&](const uint64_t v) { + ASSERT_THAT( + sendto(udp_sock.get(), reinterpret_cast<const char*>(&v), sizeof(v), + 0 /* flags */, reinterpret_cast<const sockaddr*>(&udp_bind_addr), + sizeof(udp_bind_addr)), + SyscallSucceeds()); + }; + + auto bind_to_network_protocol = [&](uint16_t protocol) { + const sockaddr_ll packet_bind_addr = { + .sll_family = AF_PACKET, + .sll_protocol = htons(protocol), + .sll_ifindex = loopback_index, + }; + + ASSERT_THAT(bind(socket_.get(), + reinterpret_cast<const sockaddr*>(&packet_bind_addr), + sizeof(packet_bind_addr)), + SyscallSucceeds()); + }; + + auto test_recv = [&, this](const uint64_t v) { + constexpr int kInfiniteTimeout = -1; + pollfd pfd = { + .fd = socket_.get(), + .events = POLLIN, + }; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfiniteTimeout), + SyscallSucceedsWithValue(1)); -// Bind with invalid address. -TEST_P(CookedPacketTest, BindFail) { - // Null address. - ASSERT_THAT( - bind(socket_, nullptr, sizeof(struct sockaddr)), - AnyOf(SyscallFailsWithErrno(EFAULT), SyscallFailsWithErrno(EINVAL))); + struct { + ethhdr eth; + iphdr ip; + udphdr udp; + uint64_t payload; + char unused; + } ABSL_ATTRIBUTE_PACKED read_pkt; + sockaddr_ll src; + socklen_t src_len = sizeof(src); + + char* buf = reinterpret_cast<char*>(&read_pkt); + size_t buflen = sizeof(read_pkt); + size_t expected_read_len = sizeof(read_pkt) - sizeof(read_pkt.unused); + if (!kEthHdrIncluded) { + buf += sizeof(read_pkt.eth); + buflen -= sizeof(read_pkt.eth); + expected_read_len -= sizeof(read_pkt.eth); + } - // Address of size 1. - uint8_t addr = 0; - ASSERT_THAT( - bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)), - SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(recvfrom(socket_.get(), buf, buflen, 0, + reinterpret_cast<sockaddr*>(&src), &src_len), + SyscallSucceedsWithValue(expected_read_len)); + // sockaddr_ll ends with an 8 byte physical address field, but ethernet + // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 + // here, but returns sizeof(sockaddr_ll) since + // https://github.com/torvalds/linux/commit/b2cf86e1563e33a14a1c69b3e508d15dc12f804c. + ASSERT_THAT(src_len, ::testing::AnyOf( + ::testing::Eq(sizeof(src)), + ::testing::Eq(sizeof(src) - sizeof(src.sll_addr) + + ETH_ALEN))); + EXPECT_EQ(src.sll_family, AF_PACKET); + EXPECT_EQ(src.sll_ifindex, loopback_index); + EXPECT_EQ(src.sll_halen, ETH_ALEN); + EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); + // This came from the loopback device, so the address is all 0s. + constexpr uint8_t allZeroesMAC[ETH_ALEN] = {}; + EXPECT_EQ(memcmp(src.sll_addr, allZeroesMAC, sizeof(allZeroesMAC)), 0); + if (kEthHdrIncluded) { + EXPECT_EQ(memcmp(read_pkt.eth.h_dest, allZeroesMAC, sizeof(allZeroesMAC)), + 0); + EXPECT_EQ( + memcmp(read_pkt.eth.h_source, allZeroesMAC, sizeof(allZeroesMAC)), 0); + EXPECT_EQ(ntohs(read_pkt.eth.h_proto), ETH_P_IP); + } + // IHL hold the size of the header in 4 byte units. + EXPECT_EQ(read_pkt.ip.ihl, sizeof(iphdr) / 4); + EXPECT_EQ(read_pkt.ip.version, IPVERSION); + const uint16_t ip_pkt_size = + sizeof(read_pkt) - sizeof(read_pkt.eth) - sizeof(read_pkt.unused); + EXPECT_EQ(ntohs(read_pkt.ip.tot_len), ip_pkt_size); + EXPECT_EQ(read_pkt.ip.protocol, IPPROTO_UDP); + EXPECT_EQ(ntohl(read_pkt.ip.daddr), INADDR_LOOPBACK); + EXPECT_EQ(ntohl(read_pkt.ip.saddr), INADDR_LOOPBACK); + EXPECT_EQ(read_pkt.udp.source, udp_bind_addr.sin_port); + EXPECT_EQ(read_pkt.udp.dest, udp_bind_addr.sin_port); + EXPECT_EQ(ntohs(read_pkt.udp.len), ip_pkt_size - sizeof(read_pkt.ip)); + EXPECT_EQ(read_pkt.payload, v); + }; + + // The packet socket is not bound to IPv4 so we should not receive the sent + // message. + uint64_t counter = 0; + ASSERT_NO_FATAL_FAILURE(send_udp_message(++counter)); + + // Bind to IPv4 and expect to receive the UDP packet we send after binding. + ASSERT_NO_FATAL_FAILURE(bind_to_network_protocol(ETH_P_IP)); + ASSERT_NO_FATAL_FAILURE(send_udp_message(++counter)); + ASSERT_NO_FATAL_FAILURE(test_recv(counter)); + + // Bind the packet socket to a random protocol. + ASSERT_NO_FATAL_FAILURE(bind_to_network_protocol(255)); + ASSERT_NO_FATAL_FAILURE(send_udp_message(++counter)); + + // Bind back to IPv4 and expect to the UDP packet we send after binding + // back to IPv4. + ASSERT_NO_FATAL_FAILURE(bind_to_network_protocol(ETH_P_IP)); + ASSERT_NO_FATAL_FAILURE(send_udp_message(++counter)); + ASSERT_NO_FATAL_FAILURE(test_recv(counter)); } -INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest, - ::testing::Values(ETH_P_IP, ETH_P_ALL)); +INSTANTIATE_TEST_SUITE_P(AllPacketSocketTests, PacketSocketTest, + Values(SOCK_DGRAM, SOCK_RAW)); } // namespace diff --git a/test/syscalls/linux/packet_socket_dgram.cc b/test/syscalls/linux/packet_socket_dgram.cc new file mode 100644 index 000000000..9515a48f2 --- /dev/null +++ b/test/syscalls/linux/packet_socket_dgram.cc @@ -0,0 +1,550 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <arpa/inet.h> +#include <ifaddrs.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <netpacket/packet.h> +#include <poll.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <unistd.h> + +#include "gtest/gtest.h" +#include "absl/base/internal/endian.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" +#include "test/util/test_util.h" + +// Some of these tests involve sending packets via AF_PACKET sockets and the +// loopback interface. Because AF_PACKET circumvents so much of the networking +// stack, Linux sees these packets as "martian", i.e. they claim to be to/from +// localhost but don't have the usual associated data. Thus Linux drops them by +// default. You can see where this happens by following the code at: +// +// - net/ipv4/ip_input.c:ip_rcv_finish, which calls +// - net/ipv4/route.c:ip_route_input_noref, which calls +// - net/ipv4/route.c:ip_route_input_slow, which finds and drops martian +// packets. +// +// To tell Linux not to drop these packets, you need to tell it to accept our +// funny packets (which are completely valid and correct, but lack associated +// in-kernel data because we use AF_PACKET): +// +// echo 1 >> /proc/sys/net/ipv4/conf/lo/accept_local +// echo 1 >> /proc/sys/net/ipv4/conf/lo/route_localnet +// +// These tests require CAP_NET_RAW to run. + +namespace gvisor { +namespace testing { + +namespace { + +using ::testing::AnyOf; +using ::testing::Eq; + +constexpr char kMessage[] = "soweoneul malhaebwa"; +constexpr in_port_t kPort = 0x409c; // htons(40000) + +// +// "Cooked" tests. Cooked AF_PACKET sockets do not contain link layer +// headers, and provide link layer destination/source information via a +// returned struct sockaddr_ll. +// + +// Send kMessage via sock to loopback +void SendUDPMessage(int sock) { + struct sockaddr_in dest = {}; + dest.sin_port = kPort; + dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + dest.sin_family = AF_INET; + EXPECT_THAT(sendto(sock, kMessage, sizeof(kMessage), 0, + reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), + SyscallSucceedsWithValue(sizeof(kMessage))); +} + +// Send an IP packet and make sure ETH_P_<something else> doesn't pick it up. +TEST(BasicCookedPacketTest, WrongType) { + if (!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())) { + ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP), + SyscallFailsWithErrno(EPERM)); + GTEST_SKIP(); + } + + FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE( + Socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_PUP))); + + // Let's use a simple IP payload: a UDP datagram. + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + SendUDPMessage(udp_sock.get()); + + // Wait and make sure the socket never becomes readable. + struct pollfd pfd = {}; + pfd.fd = sock.get(); + pfd.events = POLLIN; + EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0)); +} + +// Tests for "cooked" (SOCK_DGRAM) packet(7) sockets. +class CookedPacketTest : public ::testing::TestWithParam<int> { + protected: + // Creates a socket to be used in tests. + void SetUp() override; + + // Closes the socket created by SetUp(). + void TearDown() override; + + // Gets the device index of the loopback device. + int GetLoopbackIndex(); + + // The socket used for both reading and writing. + int socket_; +}; + +void CookedPacketTest::SetUp() { + if (!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())) { + ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())), + SyscallFailsWithErrno(EPERM)); + GTEST_SKIP(); + } + + if (!IsRunningOnGvisor()) { + FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE( + Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDONLY)); + FileDescriptor routeLocalnet = ASSERT_NO_ERRNO_AND_VALUE( + Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDONLY)); + char enabled; + ASSERT_THAT(read(acceptLocal.get(), &enabled, 1), + SyscallSucceedsWithValue(1)); + ASSERT_EQ(enabled, '1'); + ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1), + SyscallSucceedsWithValue(1)); + ASSERT_EQ(enabled, '1'); + } + + ASSERT_THAT(socket_ = socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())), + SyscallSucceeds()); +} + +void CookedPacketTest::TearDown() { + // TearDown will be run even if we skip the test. + if (ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())) { + EXPECT_THAT(close(socket_), SyscallSucceeds()); + } +} + +int CookedPacketTest::GetLoopbackIndex() { + int v = EXPECT_NO_ERRNO_AND_VALUE(gvisor::testing::GetLoopbackIndex()); + EXPECT_NE(v, 0); + return v; +} + +// Receive and verify the message via packet socket on interface. +void ReceiveMessage(int sock, int ifindex) { + // Wait for the socket to become readable. + struct pollfd pfd = {}; + pfd.fd = sock; + pfd.events = POLLIN; + EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1)); + + // Read and verify the data. + constexpr size_t packet_size = + sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage); + char buf[64]; + struct sockaddr_ll src = {}; + socklen_t src_len = sizeof(src); + ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0, + reinterpret_cast<struct sockaddr*>(&src), &src_len), + SyscallSucceedsWithValue(packet_size)); + + // sockaddr_ll ends with an 8 byte physical address field, but ethernet + // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 + // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns + // sizeof(sockaddr_ll). + ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); + + // Verify the source address. + EXPECT_EQ(src.sll_family, AF_PACKET); + EXPECT_EQ(src.sll_ifindex, ifindex); + EXPECT_EQ(src.sll_halen, ETH_ALEN); + EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); + // This came from the loopback device, so the address is all 0s. + for (int i = 0; i < src.sll_halen; i++) { + EXPECT_EQ(src.sll_addr[i], 0); + } + + // Verify the IP header. We memcpy to deal with pointer aligment. + struct iphdr ip = {}; + memcpy(&ip, buf, sizeof(ip)); + EXPECT_EQ(ip.ihl, 5); + EXPECT_EQ(ip.version, 4); + EXPECT_EQ(ip.tot_len, htons(packet_size)); + EXPECT_EQ(ip.protocol, IPPROTO_UDP); + EXPECT_EQ(ip.daddr, htonl(INADDR_LOOPBACK)); + EXPECT_EQ(ip.saddr, htonl(INADDR_LOOPBACK)); + + // Verify the UDP header. We memcpy to deal with pointer aligment. + struct udphdr udp = {}; + memcpy(&udp, buf + sizeof(iphdr), sizeof(udp)); + EXPECT_EQ(udp.dest, kPort); + EXPECT_EQ(udp.len, htons(sizeof(udphdr) + sizeof(kMessage))); + + // Verify the payload. + char* payload = reinterpret_cast<char*>(buf + sizeof(iphdr) + sizeof(udphdr)); + EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0); +} + +// Receive via a packet socket. +TEST_P(CookedPacketTest, Receive) { + // Let's use a simple IP payload: a UDP datagram. + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + SendUDPMessage(udp_sock.get()); + + // Receive and verify the data. + int loopback_index = GetLoopbackIndex(); + ReceiveMessage(socket_, loopback_index); +} + +// Send via a packet socket. +TEST_P(CookedPacketTest, Send) { + // Let's send a UDP packet and receive it using a regular UDP socket. + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + struct sockaddr_in bind_addr = {}; + bind_addr.sin_family = AF_INET; + bind_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + bind_addr.sin_port = kPort; + ASSERT_THAT( + bind(udp_sock.get(), reinterpret_cast<struct sockaddr*>(&bind_addr), + sizeof(bind_addr)), + SyscallSucceeds()); + + // Set up the destination physical address. + struct sockaddr_ll dest = {}; + dest.sll_family = AF_PACKET; + dest.sll_halen = ETH_ALEN; + dest.sll_ifindex = GetLoopbackIndex(); + dest.sll_protocol = htons(ETH_P_IP); + // We're sending to the loopback device, so the address is all 0s. + memset(dest.sll_addr, 0x00, ETH_ALEN); + + // Set up the IP header. + struct iphdr iphdr = {0}; + iphdr.ihl = 5; + iphdr.version = 4; + iphdr.tos = 0; + iphdr.tot_len = + htons(sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage)); + // Get a pseudo-random ID. If we clash with an in-use ID the test will fail, + // but we have no way of getting an ID we know to be good. + srand(*reinterpret_cast<unsigned int*>(&iphdr)); + iphdr.id = rand(); + // Linux sets this bit ("do not fragment") for small packets. + iphdr.frag_off = 1 << 6; + iphdr.ttl = 64; + iphdr.protocol = IPPROTO_UDP; + iphdr.daddr = htonl(INADDR_LOOPBACK); + iphdr.saddr = htonl(INADDR_LOOPBACK); + iphdr.check = IPChecksum(iphdr); + + // Set up the UDP header. + struct udphdr udphdr = {}; + udphdr.source = kPort; + udphdr.dest = kPort; + udphdr.len = htons(sizeof(udphdr) + sizeof(kMessage)); + udphdr.check = UDPChecksum(iphdr, udphdr, kMessage, sizeof(kMessage)); + + // Copy both headers and the payload into our packet buffer. + char send_buf[sizeof(iphdr) + sizeof(udphdr) + sizeof(kMessage)]; + memcpy(send_buf, &iphdr, sizeof(iphdr)); + memcpy(send_buf + sizeof(iphdr), &udphdr, sizeof(udphdr)); + memcpy(send_buf + sizeof(iphdr) + sizeof(udphdr), kMessage, sizeof(kMessage)); + + // Send it. + ASSERT_THAT(sendto(socket_, send_buf, sizeof(send_buf), 0, + reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Wait for the packet to become available on both sockets. + struct pollfd pfd = {}; + pfd.fd = udp_sock.get(); + pfd.events = POLLIN; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1)); + pfd.fd = socket_; + pfd.events = POLLIN; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1)); + + // Receive on the packet socket. + char recv_buf[sizeof(send_buf)]; + ASSERT_THAT(recv(socket_, recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + ASSERT_EQ(memcmp(recv_buf, send_buf, sizeof(send_buf)), 0); + + // Receive on the UDP socket. + struct sockaddr_in src; + socklen_t src_len = sizeof(src); + ASSERT_THAT(recvfrom(udp_sock.get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT, + reinterpret_cast<struct sockaddr*>(&src), &src_len), + SyscallSucceedsWithValue(sizeof(kMessage))); + // Check src and payload. + EXPECT_EQ(strncmp(recv_buf, kMessage, sizeof(kMessage)), 0); + EXPECT_EQ(src.sin_family, AF_INET); + EXPECT_EQ(src.sin_port, kPort); + EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK)); +} + +// Bind and receive via packet socket. +TEST_P(CookedPacketTest, BindReceive) { + struct sockaddr_ll bind_addr = {}; + bind_addr.sll_family = AF_PACKET; + bind_addr.sll_protocol = htons(GetParam()); + bind_addr.sll_ifindex = GetLoopbackIndex(); + + ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), + sizeof(bind_addr)), + SyscallSucceeds()); + + // Let's use a simple IP payload: a UDP datagram. + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + SendUDPMessage(udp_sock.get()); + + // Receive and verify the data. + ReceiveMessage(socket_, bind_addr.sll_ifindex); +} + +// Double Bind socket. +TEST_P(CookedPacketTest, DoubleBindSucceeds) { + struct sockaddr_ll bind_addr = {}; + bind_addr.sll_family = AF_PACKET; + bind_addr.sll_protocol = htons(GetParam()); + bind_addr.sll_ifindex = GetLoopbackIndex(); + + ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), + sizeof(bind_addr)), + SyscallSucceeds()); + + // Binding socket again should fail. + ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), + sizeof(bind_addr)), + // Linux 4.09 returns EINVAL here, but some time before 4.19 it + // switched to EADDRINUSE. + SyscallSucceeds()); +} + +// Bind and verify we do not receive data on interface which is not bound +TEST_P(CookedPacketTest, BindDrop) { + // Let's use a simple IP payload: a UDP datagram. + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + + struct ifaddrs* if_addr_list = nullptr; + auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); }); + + ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds()); + + // Get interface other than loopback. + struct ifreq ifr = {}; + for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) { + if (strcmp(i->ifa_name, "lo") != 0) { + strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name)); + break; + } + } + + // Skip if no interface is available other than loopback. + if (strlen(ifr.ifr_name) == 0) { + GTEST_SKIP(); + } + + // Get interface index. + EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds()); + EXPECT_NE(ifr.ifr_ifindex, 0); + + // Bind to packet socket requires only family, protocol and ifindex. + struct sockaddr_ll bind_addr = {}; + bind_addr.sll_family = AF_PACKET; + bind_addr.sll_protocol = htons(GetParam()); + bind_addr.sll_ifindex = ifr.ifr_ifindex; + + ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr), + sizeof(bind_addr)), + SyscallSucceeds()); + + // Send to loopback interface. + struct sockaddr_in dest = {}; + dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + dest.sin_family = AF_INET; + dest.sin_port = kPort; + EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0, + reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), + SyscallSucceedsWithValue(sizeof(kMessage))); + + // Wait and make sure the socket never receives any data. + struct pollfd pfd = {}; + pfd.fd = socket_; + pfd.events = POLLIN; + EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0)); +} + +// Verify that we receive outbound packets. This test requires at least one +// non loopback interface so that we can actually capture an outgoing packet. +TEST_P(CookedPacketTest, ReceiveOutbound) { + // Only ETH_P_ALL sockets can receive outbound packets on linux. + SKIP_IF(GetParam() != ETH_P_ALL); + + // Let's use a simple IP payload: a UDP datagram. + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + + struct ifaddrs* if_addr_list = nullptr; + auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); }); + + ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds()); + + // Get interface other than loopback. + struct ifreq ifr = {}; + for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) { + if (strcmp(i->ifa_name, "lo") != 0) { + strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name)); + break; + } + } + + // Skip if no interface is available other than loopback. + if (strlen(ifr.ifr_name) == 0) { + GTEST_SKIP(); + } + + // Get interface index and name. + EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds()); + EXPECT_NE(ifr.ifr_ifindex, 0); + int ifindex = ifr.ifr_ifindex; + + constexpr int kMACSize = 6; + char hwaddr[kMACSize]; + // Get interface address. + ASSERT_THAT(ioctl(socket_, SIOCGIFHWADDR, &ifr), SyscallSucceeds()); + ASSERT_THAT(ifr.ifr_hwaddr.sa_family, + AnyOf(Eq(ARPHRD_NONE), Eq(ARPHRD_ETHER))); + memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, kMACSize); + + // Just send it to the google dns server 8.8.8.8. It's UDP we don't care + // if it actually gets to the DNS Server we just want to see that we receive + // it on our AF_PACKET socket. + // + // NOTE: We just want to pick an IP that is non-local to avoid having to + // handle ARP as this should cause the UDP packet to be sent to the default + // gateway configured for the system under test. Otherwise the only packet we + // will see is the ARP query unless we picked an IP which will actually + // resolve. The test is a bit brittle but this was the best compromise for + // now. + struct sockaddr_in dest = {}; + ASSERT_EQ(inet_pton(AF_INET, "8.8.8.8", &dest.sin_addr.s_addr), 1); + dest.sin_family = AF_INET; + dest.sin_port = kPort; + EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0, + reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)), + SyscallSucceedsWithValue(sizeof(kMessage))); + + // Wait and make sure the socket receives the data. + struct pollfd pfd = {}; + pfd.fd = socket_; + pfd.events = POLLIN; + EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(1)); + + // Now read and check that the packet is the one we just sent. + // Read and verify the data. + constexpr size_t packet_size = + sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage); + char buf[64]; + struct sockaddr_ll src = {}; + socklen_t src_len = sizeof(src); + ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0, + reinterpret_cast<struct sockaddr*>(&src), &src_len), + SyscallSucceedsWithValue(packet_size)); + + // sockaddr_ll ends with an 8 byte physical address field, but ethernet + // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2 + // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns + // sizeof(sockaddr_ll). + ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); + + // Verify the source address. + EXPECT_EQ(src.sll_family, AF_PACKET); + EXPECT_EQ(src.sll_ifindex, ifindex); + EXPECT_EQ(src.sll_halen, ETH_ALEN); + EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); + EXPECT_EQ(src.sll_pkttype, PACKET_OUTGOING); + // Verify the link address of the interface matches that of the non + // non loopback interface address we stored above. + for (int i = 0; i < src.sll_halen; i++) { + EXPECT_EQ(src.sll_addr[i], hwaddr[i]); + } + + // Verify the IP header. + struct iphdr ip = {}; + memcpy(&ip, buf, sizeof(ip)); + EXPECT_EQ(ip.ihl, 5); + EXPECT_EQ(ip.version, 4); + EXPECT_EQ(ip.tot_len, htons(packet_size)); + EXPECT_EQ(ip.protocol, IPPROTO_UDP); + EXPECT_EQ(ip.daddr, dest.sin_addr.s_addr); + EXPECT_NE(ip.saddr, htonl(INADDR_LOOPBACK)); + + // Verify the UDP header. + struct udphdr udp = {}; + memcpy(&udp, buf + sizeof(iphdr), sizeof(udp)); + EXPECT_EQ(udp.dest, kPort); + EXPECT_EQ(udp.len, htons(sizeof(udphdr) + sizeof(kMessage))); + + // Verify the payload. + char* payload = reinterpret_cast<char*>(buf + sizeof(iphdr) + sizeof(udphdr)); + EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0); +} + +// Bind with invalid address. +TEST_P(CookedPacketTest, BindFail) { + // Null address. + ASSERT_THAT( + bind(socket_, nullptr, sizeof(struct sockaddr)), + AnyOf(SyscallFailsWithErrno(EFAULT), SyscallFailsWithErrno(EINVAL))); + + // Address of size 1. + uint8_t addr = 0; + ASSERT_THAT( + bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)), + SyscallFailsWithErrno(EINVAL)); +} + +INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest, + ::testing::Values(ETH_P_IP, ETH_P_ALL)); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc index 72080a272..aa7182b01 100644 --- a/test/syscalls/linux/packet_socket_raw.cc +++ b/test/syscalls/linux/packet_socket_raw.cc @@ -13,16 +13,13 @@ // limitations under the License. #include <arpa/inet.h> -#include <linux/capability.h> -#include <linux/filter.h> -#include <linux/if_arp.h> -#include <linux/if_packet.h> #include <net/ethernet.h> +#include <net/if_arp.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/udp.h> +#include <netpacket/packet.h> #include <poll.h> -#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/types.h> #include <unistd.h> @@ -30,10 +27,12 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/base/internal/endian.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/capability_util.h" +#include "test/util/cleanup.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // Some of these tests involve sending packets via AF_PACKET sockets and the @@ -56,8 +55,6 @@ // // These tests require CAP_NET_RAW to run. -// TODO(gvisor.dev/issue/173): gVisor support. - namespace gvisor { namespace testing { @@ -102,7 +99,7 @@ class RawPacketTest : public ::testing::TestWithParam<int> { }; void RawPacketTest::SetUp() { - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())) { ASSERT_THAT(socket(AF_PACKET, SOCK_RAW, htons(GetParam())), SyscallFailsWithErrno(EPERM)); GTEST_SKIP(); @@ -152,17 +149,15 @@ void RawPacketTest::SetUp() { void RawPacketTest::TearDown() { // TearDown will be run even if we skip the test. - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())) { EXPECT_THAT(close(s_), SyscallSucceeds()); } } int RawPacketTest::GetLoopbackIndex() { - struct ifreq ifr; - snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); - EXPECT_THAT(ioctl(s_, SIOCGIFINDEX, &ifr), SyscallSucceeds()); - EXPECT_NE(ifr.ifr_ifindex, 0); - return ifr.ifr_ifindex; + int v = EXPECT_NO_ERRNO_AND_VALUE(gvisor::testing::GetLoopbackIndex()); + EXPECT_NE(v, 0); + return v; } // Receive via a packet socket. @@ -193,7 +188,6 @@ TEST_P(RawPacketTest, Receive) { // sizeof(sockaddr_ll). ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2))); - // TODO(gvisor.dev/issue/173): Verify protocol once we return it. // Verify the source address. EXPECT_EQ(src.sll_family, AF_PACKET); EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex()); @@ -238,9 +232,6 @@ TEST_P(RawPacketTest, Receive) { // Send via a packet socket. TEST_P(RawPacketTest, Send) { - // TODO(gvisor.dev/issue/173): Remove once we support packet socket writing. - SKIP_IF(IsRunningOnGvisor()); - // Let's send a UDP packet and receive it using a regular UDP socket. FileDescriptor udp_sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); @@ -338,7 +329,7 @@ TEST_P(RawPacketTest, Send) { // Check that setting SO_RCVBUF below min is clamped to the minimum // receive buffer size. TEST_P(RawPacketTest, SetSocketRecvBufBelowMin) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); // Discover minimum receive buf size by trying to set it to zero. // See: @@ -371,7 +362,7 @@ TEST_P(RawPacketTest, SetSocketRecvBufBelowMin) { // Check that setting SO_RCVBUF above max is clamped to the maximum // receive buffer size. TEST_P(RawPacketTest, SetSocketRecvBufAboveMax) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); // Discover max buf size by trying to set the largest possible buffer size. constexpr int kRcvBufSz = 0xffffffff; @@ -398,7 +389,7 @@ TEST_P(RawPacketTest, SetSocketRecvBufAboveMax) { // Check that setting SO_RCVBUF min <= kRcvBufSz <= max is honored. TEST_P(RawPacketTest, SetSocketRecvBuf) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); int max = 0; int min = 0; @@ -447,7 +438,7 @@ TEST_P(RawPacketTest, SetSocketRecvBuf) { // Check that setting SO_SNDBUF below min is clamped to the minimum // receive buffer size. TEST_P(RawPacketTest, SetSocketSendBufBelowMin) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); // Discover minimum buffer size by trying to set it to zero. constexpr int kSndBufSz = 0; @@ -478,7 +469,7 @@ TEST_P(RawPacketTest, SetSocketSendBufBelowMin) { // Check that setting SO_SNDBUF above max is clamped to the maximum // send buffer size. TEST_P(RawPacketTest, SetSocketSendBufAboveMax) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); // Discover maximum buffer size by trying to set it to a large value. constexpr int kSndBufSz = 0xffffffff; @@ -505,7 +496,7 @@ TEST_P(RawPacketTest, SetSocketSendBufAboveMax) { // Check that setting SO_SNDBUF min <= kSndBufSz <= max is honored. TEST_P(RawPacketTest, SetSocketSendBuf) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); int max = 0; int min = 0; @@ -549,7 +540,7 @@ TEST_P(RawPacketTest, SetSocketSendBuf) { } TEST_P(RawPacketTest, GetSocketError) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); int val = 0; socklen_t val_len = sizeof(val); @@ -559,7 +550,7 @@ TEST_P(RawPacketTest, GetSocketError) { } TEST_P(RawPacketTest, GetSocketErrorBind) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); { // Bind to the loopback device. @@ -625,7 +616,7 @@ TEST_P(RawPacketTest, SetSocketDetachFilterNoInstalledFilter) { } TEST_P(RawPacketTest, GetSocketDetachFilter) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); int val = 0; socklen_t val_len = sizeof(val); @@ -634,7 +625,7 @@ TEST_P(RawPacketTest, GetSocketDetachFilter) { } TEST_P(RawPacketTest, SetAndGetSocketLinger) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); int level = SOL_SOCKET; int type = SO_LINGER; @@ -655,7 +646,7 @@ TEST_P(RawPacketTest, SetAndGetSocketLinger) { } TEST_P(RawPacketTest, GetSocketAcceptConn) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); int got = -1; socklen_t length = sizeof(got); @@ -671,7 +662,7 @@ INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest, class RawPacketMsgSizeTest : public ::testing::TestWithParam<TestAddress> {}; TEST_P(RawPacketMsgSizeTest, SendTooLong) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); TestAddress addr = GetParam().WithPort(kPort); @@ -688,8 +679,11 @@ TEST_P(RawPacketMsgSizeTest, SendTooLong) { SyscallFailsWithErrno(EMSGSIZE)); } +// TODO(https://fxbug.dev/76957): Run this test on Fuchsia once splice is +// available. +#ifndef __Fuchsia__ TEST_P(RawPacketMsgSizeTest, SpliceTooLong) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HavePacketSocketCapability())); const char buf[65536] = {}; int fds[2]; @@ -716,6 +710,7 @@ TEST_P(RawPacketMsgSizeTest, SpliceTooLong) { EXPECT_THAT(n, SyscallSucceedsWithValue(sizeof(buf))); } } +#endif // __Fuchsia__ INSTANTIATE_TEST_SUITE_P(AllRawPacketMsgSizeTest, RawPacketMsgSizeTest, ::testing::Values(V4Loopback(), V6Loopback())); diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc index 223ddc0c8..1bdfcbbe3 100644 --- a/test/syscalls/linux/partial_bad_buffer.cc +++ b/test/syscalls/linux/partial_bad_buffer.cc @@ -26,10 +26,10 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" #include "test/util/posix_error.h" +#include "test/util/socket_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/ping_socket.cc b/test/syscalls/linux/ping_socket.cc index 8268e91da..025c9568f 100644 --- a/test/syscalls/linux/ping_socket.cc +++ b/test/syscalls/linux/ping_socket.cc @@ -29,8 +29,8 @@ #include "absl/strings/str_join.h" #include "absl/types/optional.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // Note: These tests require /proc/sys/net/ipv4/ping_group_range to be @@ -128,9 +128,6 @@ std::vector<std::tuple<SocketKind, BindTestCase>> ICMPTestCases() { { .bind_to = V4Broadcast(), .want = EADDRNOTAVAIL, - // TODO(gvisor.dev/issue/5711): Remove want_gvisor once ICMP - // sockets are no longer allowed to bind to broadcast addresses. - .want_gvisor = 0, }, { .bind_to = V4Loopback(), @@ -139,9 +136,6 @@ std::vector<std::tuple<SocketKind, BindTestCase>> ICMPTestCases() { { .bind_to = V4LoopbackSubnetBroadcast(), .want = EADDRNOTAVAIL, - // TODO(gvisor.dev/issue/5711): Remove want_gvisor once ICMP - // sockets are no longer allowed to bind to broadcast addresses. - .want_gvisor = 0, }, { .bind_to = V4Multicast(), @@ -155,43 +149,33 @@ std::vector<std::tuple<SocketKind, BindTestCase>> ICMPTestCases() { .bind_to = V4AddrStr("IPv4UnknownUnicast", "192.168.1.1"), .want = EADDRNOTAVAIL, }, - // TODO(gvisor.dev/issue/6021): Remove want_gvisor from all the test - // cases below once ICMP sockets return EAFNOSUPPORT when binding to - // IPv6 addresses. { .bind_to = V6Any(), .want = EAFNOSUPPORT, - .want_gvisor = EINVAL, }, { .bind_to = V6Loopback(), .want = EAFNOSUPPORT, - .want_gvisor = EINVAL, }, { .bind_to = V6Multicast(), .want = EAFNOSUPPORT, - .want_gvisor = EINVAL, }, { .bind_to = V6MulticastInterfaceLocalAllNodes(), .want = EAFNOSUPPORT, - .want_gvisor = EINVAL, }, { .bind_to = V6MulticastLinkLocalAllNodes(), .want = EAFNOSUPPORT, - .want_gvisor = EINVAL, }, { .bind_to = V6MulticastLinkLocalAllRouters(), .want = EAFNOSUPPORT, - .want_gvisor = EINVAL, }, { .bind_to = V6AddrStr("IPv6UnknownUnicast", "fc00::1"), .want = EAFNOSUPPORT, - .want_gvisor = EINVAL, }, }); } diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index 0bba86846..209801e36 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -13,11 +13,13 @@ // limitations under the License. #include <fcntl.h> /* Obtain O_* constant definitions */ +#include <linux/futex.h> #include <linux/magic.h> #include <signal.h> #include <sys/ioctl.h> #include <sys/statfs.h> #include <sys/uio.h> +#include <syscall.h> #include <unistd.h> #include <vector> @@ -50,6 +52,9 @@ std::atomic<int> global_num_signals_received = 0; void SigRecordingHandler(int signum, siginfo_t* siginfo, void* unused_ucontext) { global_num_signals_received++; + ASSERT_THAT(syscall(SYS_futex, &global_num_signals_received, + FUTEX_WAKE | FUTEX_PRIVATE_FLAG, INT_MAX, 0, 0, 0), + SyscallSucceeds()); } PosixErrorOr<Cleanup> RegisterSignalHandler(int signum) { @@ -61,11 +66,14 @@ PosixErrorOr<Cleanup> RegisterSignalHandler(int signum) { return ScopedSigaction(signum, handler); } -void WaitForSignalDelivery(absl::Duration timeout, int max_expected) { - absl::Time wait_start = absl::Now(); - while (global_num_signals_received < max_expected && - absl::Now() - wait_start < timeout) { - absl::SleepFor(absl::Milliseconds(10)); +void WaitForSignalDelivery(int expected) { + while (1) { + int v = global_num_signals_received; + if (v >= expected) { + break; + } + RetryEINTR(syscall)(SYS_futex, &global_num_signals_received, + FUTEX_WAIT | FUTEX_PRIVATE_FLAG, v, 0, 0, 0); } } @@ -371,7 +379,7 @@ TEST_P(PipeTest, ReaderSideCloses) { EXPECT_THAT(write(wfd_.get(), &buf, sizeof(buf)), SyscallFailsWithErrno(EPIPE)); - WaitForSignalDelivery(absl::Seconds(1), 1); + WaitForSignalDelivery(1); ASSERT_EQ(global_num_signals_received, 1); } @@ -411,7 +419,7 @@ TEST_P(PipeTest, BlockWriteClosed) { notify.WaitForNotification(); ASSERT_THAT(close(rfd_.release()), SyscallSucceeds()); - WaitForSignalDelivery(absl::Seconds(1), 1); + WaitForSignalDelivery(1); ASSERT_EQ(global_num_signals_received, 1); t.Join(); @@ -443,7 +451,7 @@ TEST_P(PipeTest, BlockPartialWriteClosed) { // Unblock the above. ASSERT_THAT(close(rfd_.release()), SyscallSucceeds()); - WaitForSignalDelivery(absl::Seconds(1), 2); + WaitForSignalDelivery(2); ASSERT_EQ(global_num_signals_received, 2); t.Join(); diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc index 5ce7e8c8d..ccd084244 100644 --- a/test/syscalls/linux/poll.cc +++ b/test/syscalls/linux/poll.cc @@ -116,7 +116,7 @@ void BlockingReadableTest(int16_t mask) { }); notify.WaitForNotification(); - absl::SleepFor(absl::Seconds(1.0)); + absl::SleepFor(absl::Seconds(1)); // Write some data to the pipe. char s[] = "foo\n"; @@ -221,7 +221,7 @@ TEST_F(PollTest, BlockingEventPOLLHUP) { }); notify.WaitForNotification(); - absl::SleepFor(absl::Seconds(1.0)); + absl::SleepFor(absl::Seconds(1)); // Write some data and close the writer fd. fd1.reset(); diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc index 19a57d353..286b3d168 100644 --- a/test/syscalls/linux/prctl.cc +++ b/test/syscalls/linux/prctl.cc @@ -101,11 +101,11 @@ TEST(PrctlTest, NoNewPrivsPreservedAcrossCloneForkAndExecve) { int no_new_privs; ASSERT_THAT(no_new_privs = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), SyscallSucceeds()); - ScopedThread([] { + ScopedThread thread = ScopedThread([] { ASSERT_THAT(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), SyscallSucceeds()); EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), SyscallSucceedsWithValue(1)); - ScopedThread([] { + ScopedThread threadInner = ScopedThread([] { EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), SyscallSucceedsWithValue(1)); // Note that these ASSERT_*s failing will only return from this thread, @@ -129,9 +129,11 @@ TEST(PrctlTest, NoNewPrivsPreservedAcrossCloneForkAndExecve) { EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), SyscallSucceedsWithValue(1)); }); + threadInner.Join(); EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), SyscallSucceedsWithValue(1)); }); + thread.Join(); EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0), SyscallSucceedsWithValue(no_new_privs)); } @@ -141,7 +143,7 @@ TEST(PrctlTest, PDeathSig) { // Make the new process' parent a separate thread since the parent death // signal fires when the parent *thread* exits. - ScopedThread([&] { + ScopedThread thread = ScopedThread([&] { child_pid = fork(); TEST_CHECK(child_pid >= 0); if (child_pid == 0) { @@ -172,6 +174,7 @@ TEST(PrctlTest, PDeathSig) { // Suppress the SIGSTOP and detach from the child. ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); }); + thread.Join(); // The child should have been killed by its parent death SIGKILL. int status; @@ -211,6 +214,12 @@ TEST(PrctlTest, RootDumpability) { SyscallFailsWithErrno(EINVAL)); } +TEST(PrctlTest, SetGetSubreaper) { + // Setting subreaper on PID 1 works vacuously because PID 1 is always a + // subreaper. + EXPECT_THAT(prctl(PR_SET_CHILD_SUBREAPER, 1), SyscallSucceeds()); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/priority.cc b/test/syscalls/linux/priority.cc index 1d9bdfa70..e381248e4 100644 --- a/test/syscalls/linux/priority.cc +++ b/test/syscalls/linux/priority.cc @@ -72,7 +72,8 @@ TEST(SetpriorityTest, Implemented) { // No need to clear errno for setpriority(): // "The setpriority() call returns 0 if there is no error, or -1 if there is" - EXPECT_THAT(setpriority(PRIO_PROCESS, /*who=*/0, /*nice=*/16), + EXPECT_THAT(setpriority(PRIO_PROCESS, /*who=*/0, + /*nice=*/16), // NOLINT(bugprone-argument-comment) SyscallSucceeds()); } @@ -80,7 +81,8 @@ TEST(SetpriorityTest, Implemented) { TEST(Setpriority, InvalidWhich) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); - EXPECT_THAT(setpriority(/*which=*/3, /*who=*/0, /*nice=*/16), + EXPECT_THAT(setpriority(/*which=*/3, /*who=*/0, + /*nice=*/16), // NOLINT(bugprone-argument-comment) SyscallFailsWithErrno(EINVAL)); } @@ -88,7 +90,8 @@ TEST(Setpriority, InvalidWhich) { TEST(SetpriorityTest, ValidWho) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); - EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/16), + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), + /*nice=*/16), // NOLINT(bugprone-argument-comment) SyscallSucceeds()); } @@ -142,22 +145,26 @@ TEST(SetpriorityTest, OutsideRange) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE))); // Set niceval > 19 - EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/100), + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), + /*nice=*/100), // NOLINT(bugprone-argument-comment) SyscallSucceeds()); errno = 0; // Test niceval truncated to 19 EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), - SyscallSucceedsWithValue(/*maxnice=*/19)); + SyscallSucceedsWithValue( + /*maxnice=*/19)); // NOLINT(bugprone-argument-comment) // Set niceval < -20 - EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/-100), + EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), + /*nice=*/-100), // NOLINT(bugprone-argument-comment) SyscallSucceeds()); errno = 0; // Test niceval truncated to -20 EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), - SyscallSucceedsWithValue(/*minnice=*/-20)); + SyscallSucceedsWithValue( + /*minnice=*/-20)); // NOLINT(bugprone-argument-comment) } // Process is not found when which=PRIO_PROCESS @@ -167,7 +174,7 @@ TEST(SetpriorityTest, InvalidWho) { // Flaky, but it's tough to avoid a race condition when finding an unused pid EXPECT_THAT(setpriority(PRIO_PROCESS, /*who=*/INT_MAX - 1, - /*nice=*/16), + /*nice=*/16), // NOLINT(bugprone-argument-comment) SyscallFailsWithErrno(ESRCH)); } diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 78aa73edc..8a4025fed 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -54,6 +54,8 @@ #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "absl/synchronization/mutex.h" @@ -88,6 +90,7 @@ using ::testing::Gt; using ::testing::HasSubstr; using ::testing::IsSupersetOf; using ::testing::Pair; +using ::testing::StartsWith; using ::testing::UnorderedElementsAre; using ::testing::UnorderedElementsAreArray; @@ -1622,10 +1625,41 @@ TEST(ProcPidStatusTest, HasBasicFields) { ASSERT_FALSE(status_str.empty()); const auto status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(status_str)); - EXPECT_THAT(status, IsSupersetOf({Pair("Name", thread_name), - Pair("Tgid", absl::StrCat(tgid)), - Pair("Pid", absl::StrCat(tid)), - Pair("PPid", absl::StrCat(getppid()))})); + EXPECT_THAT(status, IsSupersetOf({ + Pair("Name", thread_name), + Pair("Tgid", absl::StrCat(tgid)), + Pair("Pid", absl::StrCat(tid)), + Pair("PPid", absl::StrCat(getppid())), + })); + + if (!IsRunningWithVFS1()) { + uid_t ruid, euid, suid; + ASSERT_THAT(getresuid(&ruid, &euid, &suid), SyscallSucceeds()); + gid_t rgid, egid, sgid; + ASSERT_THAT(getresgid(&rgid, &egid, &sgid), SyscallSucceeds()); + std::vector<gid_t> supplementary_gids; + int ngids = getgroups(0, nullptr); + supplementary_gids.resize(ngids); + ASSERT_THAT(getgroups(ngids, supplementary_gids.data()), + SyscallSucceeds()); + + EXPECT_THAT( + status, + IsSupersetOf(std::vector< + ::testing::Matcher<std::pair<std::string, std::string>>>{ + // gVisor doesn't support fsuid/gid, and even if it did there is + // no getfsuid/getfsgid(). + Pair("Uid", StartsWith(absl::StrFormat("%d\t%d\t%d\t", ruid, euid, + suid))), + Pair("Gid", StartsWith(absl::StrFormat("%d\t%d\t%d\t", rgid, egid, + sgid))), + // ParseProcStatus strips leading whitespace for each value, + // so if the Groups line is empty then the trailing space is + // stripped. + Pair("Groups", + StartsWith(absl::StrJoin(supplementary_gids, " "))), + })); + } }); } diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc index 04fecc02e..672f2dd42 100644 --- a/test/syscalls/linux/proc_net.cc +++ b/test/syscalls/linux/proc_net.cc @@ -28,10 +28,10 @@ #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "absl/time/clock.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -152,6 +152,37 @@ TEST(ProcNetDev, Format) { EXPECT_GT(entries.size(), 0); } +// GetMibsAllocationSysctl retuns a value of the net.core.mibs_allocation +// sysctl./proc/sys/net/core/mibs_allocation +// +// When mibs_allocation is unset, a netns creation inherits MIB from init +// network namespace. Otherwise, MIBS is allocated for each namespace. +int GetMibsAllocationSysctl() { + auto ret = GetContents("/proc/sys/net/core/mibs_allocation"); + if (!ret.ok()) { + // The current kernel doesn't support mibs_allocation. + return 1; + } + int32_t val; + EXPECT_TRUE(absl::SimpleAtoi(ret.ValueOrDie(), &val)); + return val; +} + +// GetSNMPMetricFromProc retrieves the metric named `item` of `type` from the +// `snmp` string which represents the file contents of /proc/net/snmp. +// +// Note to test writers: If you are writing tests to check the change in value +// of SNMP metrics, note that if GetMibsAllocationSysctl() == 0 +// (net.core.mibs_allocation isn't set), MIB is from the init network namespace +// and hence these metrics can have system-wide noise. +// +// A feasible way of testing is the following: +// * Consult RFC 1213 to learn the "SYNTAX" of the metric. +// * If net.core.mibs_allocation is set: +// - Can test any metric while checking for hard equality. +// * If net.core.mibs_allocation is NOT set (system-wide noise is present): +// - Only test "SYNTAX Counter" metrics. +// - Counter metrics are increasing in nature, so use LE/GE equality checks. PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp, const std::string& type, const std::string& item) { @@ -226,12 +257,25 @@ TEST(ProcNetSnmp, TcpReset) { newAttemptFails = ASSERT_NO_ERRNO_AND_VALUE( GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails")); - EXPECT_EQ(oldActiveOpens, newActiveOpens - 1); - EXPECT_EQ(oldOutRsts, newOutRsts - 1); - EXPECT_EQ(oldAttemptFails, newAttemptFails - 1); + if (GetMibsAllocationSysctl()) { + EXPECT_EQ(oldActiveOpens + 1, newActiveOpens); + EXPECT_EQ(oldOutRsts + 1, newOutRsts); + EXPECT_EQ(oldAttemptFails + 1, newAttemptFails); + } else { + // System-wide statistics can have some noise. These metrics should have + // increased by at least 1. + EXPECT_LE(oldActiveOpens + 1, newActiveOpens); + EXPECT_LE(oldOutRsts + 1, newOutRsts); + EXPECT_LE(oldAttemptFails + 1, newAttemptFails); + } } TEST(ProcNetSnmp, TcpEstab) { + // This test aims to test the tcpCurrEstab metric which has "SYNTAX Gauge" as + // per RFC 1213. Hence, it becomes infeasible to test this when system-wide + // statistics have noise. + SKIP_IF(GetMibsAllocationSysctl() == 0); + // TODO(gvisor.dev/issue/866): epsocket metrics are not savable. DisableSave ds; @@ -286,9 +330,9 @@ TEST(ProcNetSnmp, TcpEstab) { newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE( GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab")); - EXPECT_EQ(oldActiveOpens, newActiveOpens - 1); - EXPECT_EQ(oldPassiveOpens, newPassiveOpens - 1); - EXPECT_EQ(oldCurrEstab, newCurrEstab - 2); + EXPECT_EQ(oldActiveOpens + 1, newActiveOpens); + EXPECT_EQ(oldPassiveOpens + 1, newPassiveOpens); + EXPECT_EQ(oldCurrEstab + 2, newCurrEstab); // Send 1 byte from client to server. ASSERT_THAT(send(s_connect.get(), "a", 1, 0), SyscallSucceedsWithValue(1)); @@ -355,8 +399,15 @@ TEST(ProcNetSnmp, UdpNoPorts) { newNoPorts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts")); - EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1); - EXPECT_EQ(oldNoPorts, newNoPorts - 1); + if (GetMibsAllocationSysctl()) { + EXPECT_EQ(oldOutDatagrams + 1, newOutDatagrams); + EXPECT_EQ(oldNoPorts + 1, newNoPorts); + } else { + // System-wide statistics can have some noise. These metrics should have + // increased by at least 1. + EXPECT_LE(oldOutDatagrams + 1, newOutDatagrams); + EXPECT_LE(oldNoPorts + 1, newNoPorts); + } } TEST(ProcNetSnmp, UdpIn) { @@ -405,8 +456,15 @@ TEST(ProcNetSnmp, UdpIn) { newInDatagrams = ASSERT_NO_ERRNO_AND_VALUE( GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams")); - EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1); - EXPECT_EQ(oldInDatagrams, newInDatagrams - 1); + if (GetMibsAllocationSysctl()) { + EXPECT_EQ(oldOutDatagrams + 1, newOutDatagrams); + EXPECT_EQ(oldInDatagrams + 1, newInDatagrams); + } else { + // System-wide statistics can have some noise. These metrics should have + // increased by at least 1. + EXPECT_LE(oldOutDatagrams + 1, newOutDatagrams); + EXPECT_LE(oldInDatagrams + 1, newInDatagrams); + } } TEST(ProcNetSnmp, CheckNetStat) { @@ -498,13 +556,7 @@ TEST(ProcSysNetIpv4Recovery, CanReadAndWrite) { // Check initial value is set to 1. EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0), SyscallSucceedsWithValue(sizeof(to_write) + 1)); - if (IsRunningOnGvisor()) { - // TODO(gvisor.dev/issue/5243): TCPRACKLossDetection = 1 should be turned on - // by default. - EXPECT_EQ(strcmp(buf, "0\n"), 0); - } else { - EXPECT_EQ(strcmp(buf, "1\n"), 0); - } + EXPECT_EQ(strcmp(buf, "1\n"), 0); // Set tcp_recovery to one of the allowed constants. EXPECT_THAT(PwriteFd(fd.get(), &to_write, sizeof(to_write), 0), diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index d519b65e6..f64c23ac0 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -30,6 +30,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/flags/flag.h" +#include "absl/strings/string_view.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/util/capability_util.h" @@ -51,17 +52,10 @@ ABSL_FLAG(bool, ptrace_test_execve_child, false, ABSL_FLAG(bool, ptrace_test_trace_descendants_allowed, false, "If set, run the child workload for " "PtraceTest_TraceDescendantsAllowed."); -ABSL_FLAG(bool, ptrace_test_prctl_set_ptracer_pid, false, - "If set, run the child workload for PtraceTest_PrctlSetPtracerPID."); -ABSL_FLAG(bool, ptrace_test_prctl_set_ptracer_any, false, - "If set, run the child workload for PtraceTest_PrctlSetPtracerAny."); -ABSL_FLAG(bool, ptrace_test_prctl_clear_ptracer, false, - "If set, run the child workload for PtraceTest_PrctlClearPtracer."); -ABSL_FLAG(bool, ptrace_test_prctl_replace_ptracer, false, - "If set, run the child workload for PtraceTest_PrctlReplacePtracer."); -ABSL_FLAG(int, ptrace_test_prctl_replace_ptracer_tid, -1, - "Specifies the replacement tracer tid in the child workload for " - "PtraceTest_PrctlReplacePtracer."); +ABSL_FLAG(bool, ptrace_test_ptrace_attacher, false, + "If set, run the child workload for PtraceAttacherSubprocess."); +ABSL_FLAG(bool, ptrace_test_prctl_set_ptracer, false, + "If set, run the child workload for PrctlSetPtracerSubprocess."); ABSL_FLAG(bool, ptrace_test_prctl_set_ptracer_and_exit_tracee_thread, false, "If set, run the child workload for " "PtraceTest_PrctlSetPtracerPersistsPastTraceeThreadExit."); @@ -161,6 +155,86 @@ int CheckPtraceAttach(pid_t pid) { return 0; } +class SimpleSubprocess { + public: + explicit SimpleSubprocess(absl::string_view child_flag) { + int sockets[2]; + TEST_PCHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0); + + // Allocate vector before forking (not async-signal-safe). + ExecveArray const owned_child_argv = {"/proc/self/exe", child_flag, + "--ptrace_test_fd", + std::to_string(sockets[0])}; + char* const* const child_argv = owned_child_argv.get(); + + pid_ = fork(); + if (pid_ == 0) { + TEST_PCHECK(close(sockets[1]) == 0); + execve(child_argv[0], child_argv, /* envp = */ nullptr); + TEST_PCHECK_MSG(false, "Survived execve to test child"); + } + TEST_PCHECK(pid_ > 0); + TEST_PCHECK(close(sockets[0]) == 0); + sockfd_ = sockets[1]; + } + + SimpleSubprocess(SimpleSubprocess&& orig) + : pid_(orig.pid_), sockfd_(orig.sockfd_) { + orig.pid_ = -1; + orig.sockfd_ = -1; + } + + SimpleSubprocess& operator=(SimpleSubprocess&& orig) { + if (this != &orig) { + this->~SimpleSubprocess(); + pid_ = orig.pid_; + sockfd_ = orig.sockfd_; + orig.pid_ = -1; + orig.sockfd_ = -1; + } + return *this; + } + + SimpleSubprocess(SimpleSubprocess const&) = delete; + SimpleSubprocess& operator=(SimpleSubprocess const&) = delete; + + ~SimpleSubprocess() { + if (pid_ < 0) { + return; + } + EXPECT_THAT(shutdown(sockfd_, SHUT_RDWR), SyscallSucceeds()); + EXPECT_THAT(close(sockfd_), SyscallSucceeds()); + int status; + EXPECT_THAT(waitpid(pid_, &status, 0), SyscallSucceedsWithValue(pid_)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; + } + + pid_t pid() const { return pid_; } + + // Sends the child process the given value, receives an errno in response, and + // returns a PosixError corresponding to the received errno. + template <typename T> + PosixError Cmd(T val) { + if (WriteFd(sockfd_, &val, sizeof(val)) < 0) { + return PosixError(errno, "write failed"); + } + return RecvErrno(); + } + + private: + PosixError RecvErrno() { + int resp_errno; + if (ReadFd(sockfd_, &resp_errno, sizeof(resp_errno)) < 0) { + return PosixError(errno, "read failed"); + } + return PosixError(resp_errno); + } + + pid_t pid_ = -1; + int sockfd_ = -1; +}; + TEST(PtraceTest, AttachSelf) { EXPECT_THAT(ptrace(PTRACE_ATTACH, gettid(), 0, 0), SyscallFailsWithErrno(EPERM)); @@ -343,289 +417,128 @@ TEST(PtraceTest, PrctlSetPtracerInvalidPID) { EXPECT_THAT(prctl(PR_SET_PTRACER, 123456789), SyscallFailsWithErrno(EINVAL)); } -TEST(PtraceTest, PrctlSetPtracerPID) { - SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(YamaPtraceScope()) != 1); - - AutoCapability cap(CAP_SYS_PTRACE, false); - - // Use sockets to synchronize between tracer and tracee. - int sockets[2]; - ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets), SyscallSucceeds()); - - // Allocate vector before forking (not async-signal-safe). - ExecveArray const owned_child_argv = { - "/proc/self/exe", "--ptrace_test_prctl_set_ptracer_pid", - "--ptrace_test_fd", std::to_string(sockets[0])}; - char* const* const child_argv = owned_child_argv.get(); - - pid_t const tracee_pid = fork(); - if (tracee_pid == 0) { - TEST_PCHECK(close(sockets[1]) == 0); - // This test will create a new thread in the child process. - // pthread_create(2) isn't async-signal-safe, so we execve() first. - execve(child_argv[0], child_argv, /* envp = */ nullptr); - TEST_PCHECK_MSG(false, "Survived execve to test child"); - } - ASSERT_THAT(tracee_pid, SyscallSucceeds()); - ASSERT_THAT(close(sockets[0]), SyscallSucceeds()); - - pid_t const tracer_pid = fork(); - if (tracer_pid == 0) { - // Wait until tracee has called prctl. - char done; - TEST_PCHECK(read(sockets[1], &done, 1) == 1); - MaybeSave(); - - TEST_PCHECK(CheckPtraceAttach(tracee_pid) == 0); - _exit(0); - } - ASSERT_THAT(tracer_pid, SyscallSucceeds()); - - // Clean up tracer. - int status; - ASSERT_THAT(waitpid(tracer_pid, &status, 0), SyscallSucceeds()); - EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0); - - // Clean up tracee. - ASSERT_THAT(kill(tracee_pid, SIGKILL), SyscallSucceeds()); - ASSERT_THAT(waitpid(tracee_pid, &status, 0), - SyscallSucceedsWithValue(tracee_pid)); - EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) - << " status " << status; +SimpleSubprocess CreatePtraceAttacherSubprocess() { + return SimpleSubprocess("--ptrace_test_ptrace_attacher"); } -[[noreturn]] void RunPrctlSetPtracerPID(int fd) { - ScopedThread t([fd] { - // Perform prctl in a separate thread to verify that it is process-wide. - TEST_PCHECK(prctl(PR_SET_PTRACER, getppid()) == 0); - MaybeSave(); - // Indicate that the prctl has been set. - TEST_PCHECK(write(fd, "x", 1) == 1); - MaybeSave(); +[[noreturn]] static void RunPtraceAttacher(int sockfd) { + // execve() may have restored CAP_SYS_PTRACE if we had real UID 0. + TEST_CHECK(SetCapability(CAP_SYS_PTRACE, false).ok()); + // Perform PTRACE_ATTACH in a separate thread to verify that permissions + // apply process-wide. + ScopedThread t([&] { + while (true) { + pid_t pid; + int rv = read(sockfd, &pid, sizeof(pid)); + if (rv == 0) { + _exit(0); + } + if (rv < 0) { + _exit(1); + } + int resp_errno = 0; + if (CheckPtraceAttach(pid) < 0) { + resp_errno = errno; + } + TEST_PCHECK(write(sockfd, &resp_errno, sizeof(resp_errno)) == + sizeof(resp_errno)); + } }); while (true) { SleepSafe(absl::Seconds(1)); } } -TEST(PtraceTest, PrctlSetPtracerAny) { - SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(YamaPtraceScope()) != 1); - AutoCapability cap(CAP_SYS_PTRACE, false); - - // Use sockets to synchronize between tracer and tracee. - int sockets[2]; - ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets), SyscallSucceeds()); - - // Allocate vector before forking (not async-signal-safe). - ExecveArray const owned_child_argv = { - "/proc/self/exe", "--ptrace_test_prctl_set_ptracer_any", - "--ptrace_test_fd", std::to_string(sockets[0])}; - char* const* const child_argv = owned_child_argv.get(); - - pid_t const tracee_pid = fork(); - if (tracee_pid == 0) { - // This test will create a new thread in the child process. - // pthread_create(2) isn't async-signal-safe, so we execve() first. - TEST_PCHECK(close(sockets[1]) == 0); - execve(child_argv[0], child_argv, /* envp = */ nullptr); - TEST_PCHECK_MSG(false, "Survived execve to test child"); - } - ASSERT_THAT(tracee_pid, SyscallSucceeds()); - ASSERT_THAT(close(sockets[0]), SyscallSucceeds()); - - pid_t const tracer_pid = fork(); - if (tracer_pid == 0) { - // Wait until tracee has called prctl. - char done; - TEST_PCHECK(read(sockets[1], &done, 1) == 1); - MaybeSave(); - - TEST_PCHECK(CheckPtraceAttach(tracee_pid) == 0); - _exit(0); - } - ASSERT_THAT(tracer_pid, SyscallSucceeds()); - - // Clean up tracer. - int status; - ASSERT_THAT(waitpid(tracer_pid, &status, 0), SyscallSucceeds()); - EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) - << " status " << status; - - // Clean up tracee. - ASSERT_THAT(kill(tracee_pid, SIGKILL), SyscallSucceeds()); - ASSERT_THAT(waitpid(tracee_pid, &status, 0), - SyscallSucceedsWithValue(tracee_pid)); - EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) - << " status " << status; +SimpleSubprocess CreatePrctlSetPtracerSubprocess() { + return SimpleSubprocess("--ptrace_test_prctl_set_ptracer"); } -[[noreturn]] void RunPrctlSetPtracerAny(int fd) { - ScopedThread t([fd] { - // Perform prctl in a separate thread to verify that it is process-wide. - TEST_PCHECK(prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY) == 0); - MaybeSave(); - // Indicate that the prctl has been set. - TEST_PCHECK(write(fd, "x", 1) == 1); - MaybeSave(); +[[noreturn]] static void RunPrctlSetPtracer(int sockfd) { + // Perform prctl in a separate thread to verify that it applies + // process-wide. + ScopedThread t([&] { + while (true) { + pid_t pid; + int rv = read(sockfd, &pid, sizeof(pid)); + if (rv == 0) { + _exit(0); + } + if (rv < 0) { + _exit(1); + } + int resp_errno = 0; + if (prctl(PR_SET_PTRACER, pid) < 0) { + resp_errno = errno; + } + TEST_PCHECK(write(sockfd, &resp_errno, sizeof(resp_errno)) == + sizeof(resp_errno)); + } }); while (true) { SleepSafe(absl::Seconds(1)); } } -TEST(PtraceTest, PrctlClearPtracer) { +TEST(PtraceTest, PrctlSetPtracer) { SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(YamaPtraceScope()) != 1); - AutoCapability cap(CAP_SYS_PTRACE, false); - - // Use sockets to synchronize between tracer and tracee. - int sockets[2]; - ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets), SyscallSucceeds()); - - // Allocate vector before forking (not async-signal-safe). - ExecveArray const owned_child_argv = { - "/proc/self/exe", "--ptrace_test_prctl_clear_ptracer", "--ptrace_test_fd", - std::to_string(sockets[0])}; - char* const* const child_argv = owned_child_argv.get(); - - pid_t const tracee_pid = fork(); - if (tracee_pid == 0) { - // This test will create a new thread in the child process. - // pthread_create(2) isn't async-signal-safe, so we execve() first. - TEST_PCHECK(close(sockets[1]) == 0); - execve(child_argv[0], child_argv, /* envp = */ nullptr); - TEST_PCHECK_MSG(false, "Survived execve to test child"); - } - ASSERT_THAT(tracee_pid, SyscallSucceeds()); - ASSERT_THAT(close(sockets[0]), SyscallSucceeds()); - - pid_t const tracer_pid = fork(); - if (tracer_pid == 0) { - // Wait until tracee has called prctl. - char done; - TEST_PCHECK(read(sockets[1], &done, 1) == 1); - MaybeSave(); - - TEST_CHECK(CheckPtraceAttach(tracee_pid) == -1); - TEST_PCHECK(errno == EPERM); - _exit(0); - } - ASSERT_THAT(tracer_pid, SyscallSucceeds()); - - // Clean up tracer. - int status; - ASSERT_THAT(waitpid(tracer_pid, &status, 0), SyscallSucceeds()); - EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) - << " status " << status; - - // Clean up tracee. - ASSERT_THAT(kill(tracee_pid, SIGKILL), SyscallSucceeds()); - ASSERT_THAT(waitpid(tracee_pid, &status, 0), - SyscallSucceedsWithValue(tracee_pid)); - EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) - << " status " << status; -} - -[[noreturn]] void RunPrctlClearPtracer(int fd) { - ScopedThread t([fd] { - // Perform prctl in a separate thread to verify that it is process-wide. - TEST_PCHECK(prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY) == 0); - MaybeSave(); - TEST_PCHECK(prctl(PR_SET_PTRACER, 0) == 0); - MaybeSave(); - // Indicate that the prctl has been set/cleared. - TEST_PCHECK(write(fd, "x", 1) == 1); - MaybeSave(); - }); - while (true) { - SleepSafe(absl::Seconds(1)); - } -} -TEST(PtraceTest, PrctlReplacePtracer) { - SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(YamaPtraceScope()) != 1); AutoCapability cap(CAP_SYS_PTRACE, false); - pid_t const unused_pid = fork(); - if (unused_pid == 0) { - while (true) { - SleepSafe(absl::Seconds(1)); - } - } - ASSERT_THAT(unused_pid, SyscallSucceeds()); + // Ensure that initially, no tracer exception is set. + ASSERT_THAT(prctl(PR_SET_PTRACER, 0), SyscallSucceeds()); - // Use sockets to synchronize between tracer and tracee. - int sockets[2]; - ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets), SyscallSucceeds()); + SimpleSubprocess tracee = CreatePrctlSetPtracerSubprocess(); + SimpleSubprocess tracer = CreatePtraceAttacherSubprocess(); - // Allocate vector before forking (not async-signal-safe). - ExecveArray const owned_child_argv = { - "/proc/self/exe", - "--ptrace_test_prctl_replace_ptracer", - "--ptrace_test_prctl_replace_ptracer_tid", - std::to_string(unused_pid), - "--ptrace_test_fd", - std::to_string(sockets[0])}; - char* const* const child_argv = owned_child_argv.get(); + // By default, Yama should prevent tracer from tracing its parent (this + // process) or siblings (tracee). + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(EPERM)); + EXPECT_THAT(tracer.Cmd(tracee.pid()), PosixErrorIs(EPERM)); - pid_t const tracee_pid = fork(); - if (tracee_pid == 0) { - TEST_PCHECK(close(sockets[1]) == 0); - // This test will create a new thread in the child process. - // pthread_create(2) isn't async-signal-safe, so we execve() first. - execve(child_argv[0], child_argv, /* envp = */ nullptr); - TEST_PCHECK_MSG(false, "Survived execve to test child"); - } - ASSERT_THAT(tracee_pid, SyscallSucceeds()); - ASSERT_THAT(close(sockets[0]), SyscallSucceeds()); + // If tracee invokes PR_SET_PTRACER on either tracer's pid, the pid of any of + // its ancestors (i.e. us), or PR_SET_PTRACER_ANY, then tracer can trace it + // (but not us). - pid_t const tracer_pid = fork(); - if (tracer_pid == 0) { - // Wait until tracee has called prctl. - char done; - TEST_PCHECK(read(sockets[1], &done, 1) == 1); - MaybeSave(); + ASSERT_THAT(tracee.Cmd(tracer.pid()), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(tracee.pid()), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(EPERM)); - TEST_CHECK(CheckPtraceAttach(tracee_pid) == -1); - TEST_PCHECK(errno == EPERM); - _exit(0); - } - ASSERT_THAT(tracer_pid, SyscallSucceeds()); + ASSERT_THAT(tracee.Cmd(gettid()), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(tracee.pid()), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(EPERM)); - // Clean up tracer. - int status; - ASSERT_THAT(waitpid(tracer_pid, &status, 0), SyscallSucceeds()); - EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) - << " status " << status; + ASSERT_THAT(tracee.Cmd(static_cast<pid_t>(PR_SET_PTRACER_ANY)), + PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(tracee.pid()), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(EPERM)); - // Clean up tracee. - ASSERT_THAT(kill(tracee_pid, SIGKILL), SyscallSucceeds()); - ASSERT_THAT(waitpid(tracee_pid, &status, 0), - SyscallSucceedsWithValue(tracee_pid)); - EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) - << " status " << status; + // If tracee invokes PR_SET_PTRACER with pid 0, then tracer can no longer + // trace it. + ASSERT_THAT(tracee.Cmd(0), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(tracee.pid()), PosixErrorIs(EPERM)); - // Clean up unused. - ASSERT_THAT(kill(unused_pid, SIGKILL), SyscallSucceeds()); - ASSERT_THAT(waitpid(unused_pid, &status, 0), - SyscallSucceedsWithValue(unused_pid)); - EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) - << " status " << status; -} + // If we invoke PR_SET_PTRACER with tracer's pid, then it can trace us (but + // not our descendants). + ASSERT_THAT(prctl(PR_SET_PTRACER, tracer.pid()), SyscallSucceeds()); + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(tracee.pid()), PosixErrorIs(EPERM)); -[[noreturn]] void RunPrctlReplacePtracer(int new_tracer_pid, int fd) { - TEST_PCHECK(prctl(PR_SET_PTRACER, getppid()) == 0); - MaybeSave(); + // If we invoke PR_SET_PTRACER with pid 0, then tracer can no longer trace us. + ASSERT_THAT(prctl(PR_SET_PTRACER, 0), SyscallSucceeds()); + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(EPERM)); - ScopedThread t([new_tracer_pid, fd] { - TEST_PCHECK(prctl(PR_SET_PTRACER, new_tracer_pid) == 0); - MaybeSave(); - // Indicate that the prctl has been set. - TEST_PCHECK(write(fd, "x", 1) == 1); - MaybeSave(); - }); - while (true) { - SleepSafe(absl::Seconds(1)); - } + // Another thread in our thread group can invoke PR_SET_PTRACER instead; its + // effect applies to the whole thread group. + pid_t const our_tid = gettid(); + ScopedThread([&] { + ASSERT_THAT(prctl(PR_SET_PTRACER, tracer.pid()), SyscallSucceeds()); + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(0)); + EXPECT_THAT(tracer.Cmd(our_tid), PosixErrorIs(0)); + + ASSERT_THAT(prctl(PR_SET_PTRACER, 0), SyscallSucceeds()); + EXPECT_THAT(tracer.Cmd(gettid()), PosixErrorIs(EPERM)); + EXPECT_THAT(tracer.Cmd(our_tid), PosixErrorIs(EPERM)); + }).Join(); } // Tests that YAMA exceptions store tracees by thread group leader. Exceptions @@ -2342,21 +2255,12 @@ int main(int argc, char** argv) { gvisor::testing::RunTraceDescendantsAllowed(fd); } - if (absl::GetFlag(FLAGS_ptrace_test_prctl_set_ptracer_pid)) { - gvisor::testing::RunPrctlSetPtracerPID(fd); - } - - if (absl::GetFlag(FLAGS_ptrace_test_prctl_set_ptracer_any)) { - gvisor::testing::RunPrctlSetPtracerAny(fd); - } - - if (absl::GetFlag(FLAGS_ptrace_test_prctl_clear_ptracer)) { - gvisor::testing::RunPrctlClearPtracer(fd); + if (absl::GetFlag(FLAGS_ptrace_test_ptrace_attacher)) { + gvisor::testing::RunPtraceAttacher(fd); } - if (absl::GetFlag(FLAGS_ptrace_test_prctl_replace_ptracer)) { - gvisor::testing::RunPrctlReplacePtracer( - absl::GetFlag(FLAGS_ptrace_test_prctl_replace_ptracer_tid), fd); + if (absl::GetFlag(FLAGS_ptrace_test_prctl_set_ptracer)) { + gvisor::testing::RunPrctlSetPtracer(fd); } if (absl::GetFlag( diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc index 5ff1f12a0..c6592b4a7 100644 --- a/test/syscalls/linux/pty.cc +++ b/test/syscalls/linux/pty.cc @@ -1594,9 +1594,9 @@ TEST_F(JobControlTest, GetForegroundProcessGroupNonControlling) { // - creates a child process in a new process group // - sets that child as the foreground process group // - kills its child and sets itself as the foreground process group. -// TODO(gvisor.dev/issue/5357): Fix and enable. -TEST_F(JobControlTest, DISABLED_SetForegroundProcessGroup) { +TEST_F(JobControlTest, SetForegroundProcessGroup) { auto res = RunInChild([=]() { + TEST_PCHECK(setsid() >= 0); TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0)); // Ignore SIGTTOU so that we don't stop ourself when calling tcsetpgrp. @@ -1634,12 +1634,90 @@ TEST_F(JobControlTest, DISABLED_SetForegroundProcessGroup) { // Set ourself as the foreground process. pid_t pgid; - TEST_PCHECK(pgid = getpgid(0) == 0); + TEST_PCHECK((pgid = getpgid(0)) >= 0); TEST_PCHECK(!tcsetpgrp(replica_.get(), pgid)); }); ASSERT_NO_ERRNO(res); } +// This test verifies if a SIGTTOU signal is sent to the calling process's group +// when tcsetpgrp is called by a background process +TEST_F(JobControlTest, SetForegroundProcessGroupSIGTTOUBackground) { + auto res = RunInChild([=]() { + TEST_PCHECK(setsid() >= 0); + TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0)); + pid_t grandchild = fork(); + if (!grandchild) { + // Assign a different pgid to the child so it will result as + // a background process. + TEST_PCHECK(!setpgid(grandchild, getpid())); + TEST_PCHECK(!tcsetpgrp(replica_.get(), getpgid(0))); + // We should never reach this. + _exit(1); + } + int wstatus; + TEST_PCHECK(waitpid(grandchild, &wstatus, WSTOPPED) == grandchild); + TEST_PCHECK(WSTOPSIG(wstatus) == SIGTTOU); + }); + ASSERT_NO_ERRNO(res); +} + +// This test verifies that a SIGTTOU signal is not delivered to +// a background process which calls tcsetpgrp and is ignoring SIGTTOU +TEST_F(JobControlTest, SetForegroundProcessGroupSIGTTOUIgnored) { + auto res = RunInChild([=]() { + TEST_PCHECK(setsid() >= 0); + TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0)); + pid_t grandchild = fork(); + if (!grandchild) { + // Ignore SIGTTOU so the child in background won't + // be stopped when it will call tcsetpgrp + struct sigaction sa = {}; + sa.sa_handler = SIG_IGN; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + sigaction(SIGTTOU, &sa, NULL); + // Assign a different pgid to the child so it will result as + // a background process. + TEST_PCHECK(!setpgid(grandchild, getpid())); + TEST_PCHECK(!tcsetpgrp(replica_.get(), getpgid(0))); + _exit(0); + } + int wstatus; + TEST_PCHECK(waitpid(grandchild, &wstatus, WSTOPPED) == grandchild); + TEST_PCHECK(WSTOPSIG(wstatus) != SIGTTOU); + TEST_PCHECK(WIFEXITED(wstatus)); + }); + ASSERT_NO_ERRNO(res); +} + +// This test verifies that a SIGTTOU signal is not delivered to +// a background process which calls tcsetpgrp and is blocking SIGTTOU +TEST_F(JobControlTest, SetForegroundProcessGroupSIGTTOUBlocked) { + auto res = RunInChild([=]() { + TEST_PCHECK(setsid() >= 0); + TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0)); + pid_t grandchild = fork(); + if (!grandchild) { + // Block SIGTTOU so the child in background won't + // be stopped when it will call tcsetpgrp + sigset_t signal_set; + sigemptyset(&signal_set); + sigaddset(&signal_set, SIGTTOU); + sigprocmask(SIG_BLOCK, &signal_set, NULL); + // Assign a different pgid to the child so it will result as + // a background process. + TEST_PCHECK(!setpgid(grandchild, getpid())); + TEST_PCHECK(!tcsetpgrp(replica_.get(), getpgid(0))); + _exit(0); + } + int wstatus; + TEST_PCHECK(waitpid(grandchild, &wstatus, WSTOPPED) == grandchild); + TEST_PCHECK(WSTOPSIG(wstatus) != SIGTTOU); + }); + ASSERT_NO_ERRNO(res); +} + TEST_F(JobControlTest, SetForegroundProcessGroupWrongTTY) { pid_t pid = getpid(); ASSERT_THAT(ioctl(replica_.get(), TIOCSPGRP, &pid), @@ -1657,9 +1735,9 @@ TEST_F(JobControlTest, SetForegroundProcessGroupNegPgid) { ASSERT_NO_ERRNO(ret); } -// TODO(gvisor.dev/issue/5357): Fix and enable. -TEST_F(JobControlTest, DISABLED_SetForegroundProcessGroupEmptyProcessGroup) { +TEST_F(JobControlTest, SetForegroundProcessGroupEmptyProcessGroup) { auto res = RunInChild([=]() { + TEST_PCHECK(setsid() >= 0); TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0)); // Create a new process, put it in a new process group, make that group the diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc index 69616b400..ef1db47ee 100644 --- a/test/syscalls/linux/raw_socket.cc +++ b/test/syscalls/linux/raw_socket.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <linux/capability.h> -#include <linux/filter.h> +#include <arpa/inet.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip6.h> @@ -26,10 +25,10 @@ #include <algorithm> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // Note: in order to run these tests, /proc/sys/net/ipv4/ping_group_range will @@ -76,6 +75,20 @@ class RawSocketTest : public ::testing::TestWithParam<std::tuple<int, int>> { return 0; } + uint16_t Port(struct sockaddr* s) { + if (Family() == AF_INET) { + return ntohs(reinterpret_cast<struct sockaddr_in*>(s)->sin_port); + } + return ntohs(reinterpret_cast<struct sockaddr_in6*>(s)->sin6_port); + } + + void* Addr(struct sockaddr* s) { + if (Family() == AF_INET) { + return &(reinterpret_cast<struct sockaddr_in*>(s)->sin_addr); + } + return &(reinterpret_cast<struct sockaddr_in6*>(s)->sin6_addr); + } + // The socket used for both reading and writing. int s_; @@ -84,7 +97,7 @@ class RawSocketTest : public ::testing::TestWithParam<std::tuple<int, int>> { }; void RawSocketTest::SetUp() { - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())) { ASSERT_THAT(socket(Family(), SOCK_RAW, Protocol()), SyscallFailsWithErrno(EPERM)); GTEST_SKIP(); @@ -108,7 +121,7 @@ void RawSocketTest::SetUp() { void RawSocketTest::TearDown() { // TearDown will be run even if we skip the test. - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())) { EXPECT_THAT(close(s_), SyscallSucceeds()); } } @@ -117,7 +130,7 @@ void RawSocketTest::TearDown() { // BasicRawSocket::Setup creates the first one, so we only have to create one // more here. TEST_P(RawSocketTest, MultipleCreation) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int s2; ASSERT_THAT(s2 = socket(Family(), SOCK_RAW, Protocol()), SyscallSucceeds()); @@ -127,7 +140,7 @@ TEST_P(RawSocketTest, MultipleCreation) { // Test that shutting down an unconnected socket fails. TEST_P(RawSocketTest, FailShutdownWithoutConnect) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); ASSERT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN)); @@ -135,7 +148,7 @@ TEST_P(RawSocketTest, FailShutdownWithoutConnect) { // Shutdown is a no-op for raw sockets (and datagram sockets in general). TEST_P(RawSocketTest, ShutdownWriteNoop) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), @@ -150,7 +163,7 @@ TEST_P(RawSocketTest, ShutdownWriteNoop) { // Shutdown is a no-op for raw sockets (and datagram sockets in general). TEST_P(RawSocketTest, ShutdownReadNoop) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), @@ -167,23 +180,71 @@ TEST_P(RawSocketTest, ShutdownReadNoop) { // Test that listen() fails. TEST_P(RawSocketTest, FailListen) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT(listen(s_, 1), SyscallFailsWithErrno(ENOTSUP)); } // Test that accept() fails. TEST_P(RawSocketTest, FailAccept) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); struct sockaddr saddr; socklen_t addrlen; ASSERT_THAT(accept(s_, &saddr, &addrlen), SyscallFailsWithErrno(ENOTSUP)); } +TEST_P(RawSocketTest, BindThenGetSockName) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); + + struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_); + ASSERT_THAT(bind(s_, addr, AddrLen()), SyscallSucceeds()); + struct sockaddr_storage saddr_storage; + struct sockaddr* saddr = reinterpret_cast<struct sockaddr*>(&saddr_storage); + socklen_t saddrlen = AddrLen(); + ASSERT_THAT(getsockname(s_, saddr, &saddrlen), SyscallSucceeds()); + ASSERT_EQ(saddrlen, AddrLen()); + + // The port is expected to hold the protocol number. + EXPECT_EQ(Port(saddr), Protocol()); + + char addrbuf[INET6_ADDRSTRLEN], saddrbuf[INET6_ADDRSTRLEN]; + const char* addrstr = + inet_ntop(addr->sa_family, Addr(addr), addrbuf, sizeof(addrbuf)); + ASSERT_NE(addrstr, nullptr); + const char* saddrstr = + inet_ntop(saddr->sa_family, Addr(saddr), saddrbuf, sizeof(saddrbuf)); + ASSERT_NE(saddrstr, nullptr); + EXPECT_STREQ(saddrstr, addrstr); +} + +TEST_P(RawSocketTest, ConnectThenGetSockName) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); + + struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_); + ASSERT_THAT(connect(s_, addr, AddrLen()), SyscallSucceeds()); + struct sockaddr_storage saddr_storage; + struct sockaddr* saddr = reinterpret_cast<struct sockaddr*>(&saddr_storage); + socklen_t saddrlen = AddrLen(); + ASSERT_THAT(getsockname(s_, saddr, &saddrlen), SyscallSucceeds()); + ASSERT_EQ(saddrlen, AddrLen()); + + // The port is expected to hold the protocol number. + EXPECT_EQ(Port(saddr), Protocol()); + + char addrbuf[INET6_ADDRSTRLEN], saddrbuf[INET6_ADDRSTRLEN]; + const char* addrstr = + inet_ntop(addr->sa_family, Addr(addr), addrbuf, sizeof(addrbuf)); + ASSERT_NE(addrstr, nullptr); + const char* saddrstr = + inet_ntop(saddr->sa_family, Addr(saddr), saddrbuf, sizeof(saddrbuf)); + ASSERT_NE(saddrstr, nullptr); + EXPECT_STREQ(saddrstr, addrstr); +} + // Test that getpeername() returns nothing before connect(). TEST_P(RawSocketTest, FailGetPeerNameBeforeConnect) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); struct sockaddr saddr; socklen_t addrlen = sizeof(saddr); @@ -193,7 +254,7 @@ TEST_P(RawSocketTest, FailGetPeerNameBeforeConnect) { // Test that getpeername() returns something after connect(). TEST_P(RawSocketTest, GetPeerName) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), @@ -207,7 +268,7 @@ TEST_P(RawSocketTest, GetPeerName) { // Test that the socket is writable immediately. TEST_P(RawSocketTest, PollWritableImmediately) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); struct pollfd pfd = {}; pfd.fd = s_; @@ -217,7 +278,7 @@ TEST_P(RawSocketTest, PollWritableImmediately) { // Test that the socket isn't readable before receiving anything. TEST_P(RawSocketTest, PollNotReadableInitially) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Try to receive data with MSG_DONTWAIT, which returns immediately if there's // nothing to be read. @@ -228,7 +289,7 @@ TEST_P(RawSocketTest, PollNotReadableInitially) { // Test that the socket becomes readable once something is written to it. TEST_P(RawSocketTest, PollTriggeredOnWrite) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Write something so that there's data to be read. // Arbitrary. @@ -243,7 +304,7 @@ TEST_P(RawSocketTest, PollTriggeredOnWrite) { // Test that we can connect() to a valid IP (loopback). TEST_P(RawSocketTest, ConnectToLoopback) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), @@ -252,7 +313,7 @@ TEST_P(RawSocketTest, ConnectToLoopback) { // Test that calling send() without connect() fails. TEST_P(RawSocketTest, SendWithoutConnectFails) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Arbitrary. constexpr char kBuf[] = "Endgame was good"; @@ -262,7 +323,7 @@ TEST_P(RawSocketTest, SendWithoutConnectFails) { // Wildcard Bind. TEST_P(RawSocketTest, BindToWildcard) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); struct sockaddr_storage addr; addr = {}; @@ -283,16 +344,15 @@ TEST_P(RawSocketTest, BindToWildcard) { // Bind to localhost. TEST_P(RawSocketTest, BindToLocalhost) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); - ASSERT_THAT( - bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), - SyscallSucceeds()); + ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), + SyscallSucceeds()); } // Bind to a different address. TEST_P(RawSocketTest, BindToInvalid) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); struct sockaddr_storage bind_addr = addr_; if (Family() == AF_INET) { @@ -304,13 +364,14 @@ TEST_P(RawSocketTest, BindToInvalid) { memset(&sin6->sin6_addr.s6_addr, 0, sizeof(sin6->sin6_addr.s6_addr)); sin6->sin6_addr.s6_addr[0] = 1; // 1: - An address that we can't bind to. } - ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&bind_addr), - AddrLen()), SyscallFailsWithErrno(EADDRNOTAVAIL)); + ASSERT_THAT( + bind(s_, reinterpret_cast<struct sockaddr*>(&bind_addr), AddrLen()), + SyscallFailsWithErrno(EADDRNOTAVAIL)); } // Send and receive an packet. TEST_P(RawSocketTest, SendAndReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Arbitrary. constexpr char kBuf[] = "TB12"; @@ -325,7 +386,7 @@ TEST_P(RawSocketTest, SendAndReceive) { // We should be able to create multiple raw sockets for the same protocol and // receive the same packet on both. TEST_P(RawSocketTest, MultipleSocketReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int s2; ASSERT_THAT(s2 = socket(Family(), SOCK_RAW, Protocol()), SyscallSucceeds()); @@ -340,11 +401,11 @@ TEST_P(RawSocketTest, MultipleSocketReceive) { // Receive it on socket 2. std::vector<char> recv_buf2(sizeof(kBuf) + HdrLen()); - ASSERT_NO_FATAL_FAILURE(ReceiveBufFrom(s2, recv_buf2.data(), - recv_buf2.size())); + ASSERT_NO_FATAL_FAILURE( + ReceiveBufFrom(s2, recv_buf2.data(), recv_buf2.size())); - EXPECT_EQ(memcmp(recv_buf1.data() + HdrLen(), - recv_buf2.data() + HdrLen(), sizeof(kBuf)), + EXPECT_EQ(memcmp(recv_buf1.data() + HdrLen(), recv_buf2.data() + HdrLen(), + sizeof(kBuf)), 0); ASSERT_THAT(close(s2), SyscallSucceeds()); @@ -352,7 +413,7 @@ TEST_P(RawSocketTest, MultipleSocketReceive) { // Test that connect sends packets to the right place. TEST_P(RawSocketTest, SendAndReceiveViaConnect) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), @@ -371,11 +432,10 @@ TEST_P(RawSocketTest, SendAndReceiveViaConnect) { // Bind to localhost, then send and receive packets. TEST_P(RawSocketTest, BindSendAndReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); - ASSERT_THAT( - bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), - SyscallSucceeds()); + ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), + SyscallSucceeds()); // Arbitrary. constexpr char kBuf[] = "DR16"; @@ -389,11 +449,10 @@ TEST_P(RawSocketTest, BindSendAndReceive) { // Bind and connect to localhost and send/receive packets. TEST_P(RawSocketTest, BindConnectSendAndReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); - ASSERT_THAT( - bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), - SyscallSucceeds()); + ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), + SyscallSucceeds()); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), SyscallSucceeds()); @@ -411,7 +470,7 @@ TEST_P(RawSocketTest, BindConnectSendAndReceive) { // Check that setting SO_RCVBUF below min is clamped to the minimum // receive buffer size. TEST_P(RawSocketTest, SetSocketRecvBufBelowMin) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Discover minimum receive buf size by trying to set it to zero. // See: @@ -444,7 +503,7 @@ TEST_P(RawSocketTest, SetSocketRecvBufBelowMin) { // Check that setting SO_RCVBUF above max is clamped to the maximum // receive buffer size. TEST_P(RawSocketTest, SetSocketRecvBufAboveMax) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Discover max buf size by trying to set the largest possible buffer size. constexpr int kRcvBufSz = 0xffffffff; @@ -471,7 +530,7 @@ TEST_P(RawSocketTest, SetSocketRecvBufAboveMax) { // Check that setting SO_RCVBUF min <= kRcvBufSz <= max is honored. TEST_P(RawSocketTest, SetSocketRecvBuf) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int max = 0; int min = 0; @@ -521,7 +580,7 @@ TEST_P(RawSocketTest, SetSocketRecvBuf) { // Check that setting SO_SNDBUF below min is clamped to the minimum // receive buffer size. TEST_P(RawSocketTest, SetSocketSendBufBelowMin) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Discover minimum buffer size by trying to set it to zero. constexpr int kSndBufSz = 0; @@ -552,7 +611,7 @@ TEST_P(RawSocketTest, SetSocketSendBufBelowMin) { // Check that setting SO_SNDBUF above max is clamped to the maximum // send buffer size. TEST_P(RawSocketTest, SetSocketSendBufAboveMax) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Discover maximum buffer size by trying to set it to a large value. constexpr int kSndBufSz = 0xffffffff; @@ -579,7 +638,7 @@ TEST_P(RawSocketTest, SetSocketSendBufAboveMax) { // Check that setting SO_SNDBUF min <= kSndBufSz <= max is honored. TEST_P(RawSocketTest, SetSocketSendBuf) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int max = 0; int min = 0; @@ -625,11 +684,10 @@ TEST_P(RawSocketTest, SetSocketSendBuf) { // Test that receive buffer limits are not enforced when the recv buffer is // empty. TEST_P(RawSocketTest, RecvBufLimitsEmptyRecvBuffer) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); - ASSERT_THAT( - bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), - SyscallSucceeds()); + ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), + SyscallSucceeds()); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), SyscallSucceeds()); @@ -656,9 +714,7 @@ TEST_P(RawSocketTest, RecvBufLimitsEmptyRecvBuffer) { // Receive the packet and make sure it's identical. std::vector<char> recv_buf(buf.size() + HdrLen()); ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size())); - EXPECT_EQ( - memcmp(recv_buf.data() + HdrLen(), buf.data(), buf.size()), - 0); + EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), buf.data(), buf.size()), 0); } { @@ -671,9 +727,7 @@ TEST_P(RawSocketTest, RecvBufLimitsEmptyRecvBuffer) { // Receive the packet and make sure it's identical. std::vector<char> recv_buf(buf.size() + HdrLen()); ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size())); - EXPECT_EQ( - memcmp(recv_buf.data() + HdrLen(), buf.data(), buf.size()), - 0); + EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), buf.data(), buf.size()), 0); } } @@ -687,11 +741,10 @@ TEST_P(RawSocketTest, RecvBufLimits) { if (Protocol() == IPPROTO_TCP) { return; } - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); - ASSERT_THAT( - bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), - SyscallSucceeds()); + ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), + SyscallSucceeds()); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()), SyscallSucceeds()); @@ -751,9 +804,7 @@ TEST_P(RawSocketTest, RecvBufLimits) { // Receive the packet and make sure it's identical. std::vector<char> recv_buf(buf.size() + HdrLen()); ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size())); - EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), buf.data(), - buf.size()), - 0); + EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), buf.data(), buf.size()), 0); } // Assert that the last packet is dropped because the receive buffer should @@ -822,7 +873,7 @@ TEST_P(RawSocketTest, GetSocketDetachFilter) { // AF_INET6+SOCK_RAW+IPPROTO_RAW sockets can be created, but not written to. TEST(RawSocketTest, IPv6ProtoRaw) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int sock; ASSERT_THAT(sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW), @@ -839,7 +890,7 @@ TEST(RawSocketTest, IPv6ProtoRaw) { } TEST(RawSocketTest, IPv6SendMsg) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int sock; ASSERT_THAT(sock = socket(AF_INET6, SOCK_RAW, IPPROTO_TCP), @@ -867,7 +918,7 @@ TEST(RawSocketTest, IPv6SendMsg) { } TEST_P(RawSocketTest, ConnectOnIPv6Socket) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int sock; ASSERT_THAT(sock = socket(AF_INET6, SOCK_RAW, IPPROTO_TCP), @@ -888,6 +939,124 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine(::testing::Values(IPPROTO_TCP, IPPROTO_UDP), ::testing::Values(AF_INET, AF_INET6))); +void TestRawSocketMaybeBindReceive(bool do_bind) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); + + constexpr char payload[] = "abcdefgh"; + + const sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr = {.s_addr = htonl(INADDR_LOOPBACK)}, + }; + + FileDescriptor udp_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + sockaddr_in udp_sock_bind_addr = addr; + socklen_t udp_sock_bind_addr_len = sizeof(udp_sock_bind_addr); + ASSERT_THAT(bind(udp_sock.get(), + reinterpret_cast<const sockaddr*>(&udp_sock_bind_addr), + sizeof(udp_sock_bind_addr)), + SyscallSucceeds()); + ASSERT_THAT(getsockname(udp_sock.get(), + reinterpret_cast<sockaddr*>(&udp_sock_bind_addr), + &udp_sock_bind_addr_len), + SyscallSucceeds()); + ASSERT_EQ(udp_sock_bind_addr_len, sizeof(udp_sock_bind_addr)); + + FileDescriptor raw_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP)); + + auto test_recv = [&](const char* scope, uint32_t expected_destination) { + SCOPED_TRACE(scope); + + constexpr int kInfinitePollTimeout = -1; + pollfd pfd = { + .fd = raw_sock.get(), + .events = POLLIN, + }; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfinitePollTimeout), + SyscallSucceedsWithValue(1)); + + struct ipv4_udp_packet { + iphdr ip; + udphdr udp; + char data[sizeof(payload)]; + + // Used to make sure only the required space is used. + char unused_space; + } ABSL_ATTRIBUTE_PACKED; + constexpr size_t kExpectedIPPacketSize = + offsetof(ipv4_udp_packet, unused_space); + + // Receive the whole IPv4 packet on the raw socket. + ipv4_udp_packet read_raw_packet; + sockaddr_in peer; + socklen_t peerlen = sizeof(peer); + ASSERT_EQ( + recvfrom(raw_sock.get(), reinterpret_cast<char*>(&read_raw_packet), + sizeof(read_raw_packet), 0 /* flags */, + reinterpret_cast<sockaddr*>(&peer), &peerlen), + static_cast<ssize_t>(kExpectedIPPacketSize)) + << strerror(errno); + ASSERT_EQ(peerlen, sizeof(peer)); + EXPECT_EQ(read_raw_packet.ip.version, static_cast<unsigned int>(IPVERSION)); + // IHL holds the number of header bytes in 4 byte units. + EXPECT_EQ(read_raw_packet.ip.ihl, sizeof(read_raw_packet.ip) / 4); + EXPECT_EQ(ntohs(read_raw_packet.ip.tot_len), kExpectedIPPacketSize); + EXPECT_EQ(ntohs(read_raw_packet.ip.frag_off) & IP_OFFMASK, 0); + EXPECT_EQ(read_raw_packet.ip.protocol, SOL_UDP); + EXPECT_EQ(ntohl(read_raw_packet.ip.saddr), INADDR_LOOPBACK); + EXPECT_EQ(ntohl(read_raw_packet.ip.daddr), expected_destination); + EXPECT_EQ(read_raw_packet.udp.source, udp_sock_bind_addr.sin_port); + EXPECT_EQ(read_raw_packet.udp.dest, udp_sock_bind_addr.sin_port); + EXPECT_EQ(ntohs(read_raw_packet.udp.len), + kExpectedIPPacketSize - sizeof(read_raw_packet.ip)); + for (size_t i = 0; i < sizeof(payload); i++) { + EXPECT_EQ(read_raw_packet.data[i], payload[i]) + << "byte mismatch @ idx=" << i; + } + EXPECT_EQ(peer.sin_family, AF_INET); + EXPECT_EQ(peer.sin_port, 0); + EXPECT_EQ(ntohl(peer.sin_addr.s_addr), INADDR_LOOPBACK); + }; + + if (do_bind) { + ASSERT_THAT(bind(raw_sock.get(), reinterpret_cast<const sockaddr*>(&addr), + sizeof(addr)), + SyscallSucceeds()); + } + + constexpr int kSendToFlags = 0; + sockaddr_in different_addr = udp_sock_bind_addr; + different_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK + 1); + ASSERT_THAT(sendto(udp_sock.get(), payload, sizeof(payload), kSendToFlags, + reinterpret_cast<const sockaddr*>(&different_addr), + sizeof(different_addr)), + SyscallSucceedsWithValue(sizeof(payload))); + if (!do_bind) { + ASSERT_NO_FATAL_FAILURE( + test_recv("different_addr", ntohl(different_addr.sin_addr.s_addr))); + } + ASSERT_THAT(sendto(udp_sock.get(), payload, sizeof(payload), kSendToFlags, + reinterpret_cast<const sockaddr*>(&udp_sock_bind_addr), + sizeof(udp_sock_bind_addr)), + SyscallSucceedsWithValue(sizeof(payload))); + ASSERT_NO_FATAL_FAILURE( + test_recv("addr", ntohl(udp_sock_bind_addr.sin_addr.s_addr))); +} + +TEST(RawSocketTest, UnboundReceive) { + // Test that a raw socket receives packets destined to any address if it is + // not bound to an address. + ASSERT_NO_FATAL_FAILURE(TestRawSocketMaybeBindReceive(false /* do_bind */)); +} + +TEST(RawSocketTest, BindReceive) { + // Test that a raw socket only receives packets destined to the address it is + // bound to. + ASSERT_NO_FATAL_FAILURE(TestRawSocketMaybeBindReceive(true /* do_bind */)); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc index 8b3d02d97..d45bd07bc 100644 --- a/test/syscalls/linux/raw_socket_hdrincl.cc +++ b/test/syscalls/linux/raw_socket_hdrincl.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <linux/capability.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_icmp.h> @@ -27,10 +26,10 @@ #include "gtest/gtest.h" #include "absl/base/internal/endian.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -63,7 +62,7 @@ class RawHDRINCL : public ::testing::Test { }; void RawHDRINCL::SetUp() { - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())) { ASSERT_THAT(socket(AF_INET, SOCK_RAW, IPPROTO_RAW), SyscallFailsWithErrno(EPERM)); GTEST_SKIP(); @@ -81,7 +80,7 @@ void RawHDRINCL::SetUp() { void RawHDRINCL::TearDown() { // TearDown will be run even if we skip the test. - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())) { EXPECT_THAT(close(socket_), SyscallSucceeds()); } } @@ -177,11 +176,10 @@ TEST_F(RawHDRINCL, ConnectToLoopback) { SyscallSucceeds()); } -// FIXME(gvisor.dev/issue/3159): Test currently flaky. -TEST_F(RawHDRINCL, DISABLED_SendWithoutConnectSucceeds) { +TEST_F(RawHDRINCL, SendWithoutConnectFails) { struct iphdr hdr = LoopbackHeader(); ASSERT_THAT(send(socket_, &hdr, sizeof(hdr), 0), - SyscallSucceedsWithValue(sizeof(hdr))); + SyscallFailsWithErrno(EDESTADDRREQ)); } // HDRINCL implies write-only. Verify that we can't read a packet sent to @@ -282,9 +280,6 @@ TEST_F(RawHDRINCL, SendAndReceive) { // Send and receive a packet where the sendto address is not the same as the // provided destination. TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) { - // FIXME(gvisor.dev/issue/3160): Test currently flaky. - SKIP_IF(true); - int port = 40000; if (!IsRunningOnGvisor()) { port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE( @@ -302,18 +297,20 @@ TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) { ASSERT_TRUE( FillPacket(packet, sizeof(packet), port, kPayload, sizeof(kPayload))); // Overwrite the IP destination address with an IP we can't get to. + constexpr int32_t kUnreachable = 42; struct iphdr iphdr = {}; memcpy(&iphdr, packet, sizeof(iphdr)); - iphdr.daddr = 42; + iphdr.daddr = kUnreachable; memcpy(packet, &iphdr, sizeof(iphdr)); + // Send to localhost via loopback. socklen_t addrlen = sizeof(addr_); ASSERT_NO_FATAL_FAILURE(sendto(socket_, &packet, sizeof(packet), 0, reinterpret_cast<struct sockaddr*>(&addr_), addrlen)); - // Receive the payload, since sendto should replace the bad destination with - // localhost. + // Receive the payload. Despite an unreachable destination address, sendto + // should have sent the packet through loopback. char recv_buf[sizeof(packet)]; struct sockaddr_in src; socklen_t src_size = sizeof(src); @@ -331,9 +328,8 @@ TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) { struct iphdr recv_iphdr = {}; memcpy(&recv_iphdr, recv_buf, sizeof(recv_iphdr)); EXPECT_NE(recv_iphdr.id, 0); - // The destination address should be localhost, not the bad IP we set - // initially. - EXPECT_EQ(absl::gbswap_32(recv_iphdr.daddr), INADDR_LOOPBACK); + // The destination address is kUnreachable despite arriving via loopback. + EXPECT_EQ(recv_iphdr.daddr, kUnreachable); } // Send and receive a packet w/ the IP_HDRINCL option set. diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc index bd779da92..3f9717284 100644 --- a/test/syscalls/linux/raw_socket_icmp.cc +++ b/test/syscalls/linux/raw_socket_icmp.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <linux/capability.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_icmp.h> @@ -24,10 +23,10 @@ #include <cstdint> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -77,7 +76,7 @@ class RawSocketICMPTest : public ::testing::Test { }; void RawSocketICMPTest::SetUp() { - if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())) { ASSERT_THAT(socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallFailsWithErrno(EPERM)); GTEST_SKIP(); @@ -95,7 +94,7 @@ void RawSocketICMPTest::SetUp() { void RawSocketICMPTest::TearDown() { // TearDown will be run even if we skip the test. - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())) { EXPECT_THAT(close(s_), SyscallSucceeds()); } } @@ -103,7 +102,7 @@ void RawSocketICMPTest::TearDown() { // We'll only read an echo in this case, as the kernel won't respond to the // malformed ICMP checksum. TEST_F(RawSocketICMPTest, SendAndReceiveBadChecksum) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, // and ID. None of that should matter for raw sockets - the kernel should @@ -132,7 +131,7 @@ TEST_F(RawSocketICMPTest, SendAndReceiveBadChecksum) { // Send and receive an ICMP packet. TEST_F(RawSocketICMPTest, SendAndReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. // None of that should matter for raw sockets - the kernel should still give @@ -152,7 +151,7 @@ TEST_F(RawSocketICMPTest, SendAndReceive) { // We should be able to create multiple raw sockets for the same protocol and // receive the same packet on both. TEST_F(RawSocketICMPTest, MultipleSocketReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); FileDescriptor s2 = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)); @@ -215,7 +214,7 @@ TEST_F(RawSocketICMPTest, MultipleSocketReceive) { // A raw ICMP socket and ping socket should both receive the ICMP packets // intended for the ping socket. TEST_F(RawSocketICMPTest, RawAndPingSockets) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); FileDescriptor ping_sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)); @@ -262,10 +261,10 @@ TEST_F(RawSocketICMPTest, RawAndPingSockets) { } // A raw ICMP socket should be able to send a malformed short ICMP Echo Request, -// while ping socket should not. -// Neither should be able to receieve a short malformed packet. +// while a ping socket should not. Neither should be able to receieve a short +// malformed packet. TEST_F(RawSocketICMPTest, ShortEchoRawAndPingSockets) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); FileDescriptor ping_sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)); @@ -306,7 +305,7 @@ TEST_F(RawSocketICMPTest, ShortEchoRawAndPingSockets) { // while ping socket should not. // Neither should be able to receieve a short malformed packet. TEST_F(RawSocketICMPTest, ShortEchoReplyRawAndPingSockets) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); FileDescriptor ping_sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)); @@ -345,7 +344,7 @@ TEST_F(RawSocketICMPTest, ShortEchoReplyRawAndPingSockets) { // Test that connect() sends packets to the right place. TEST_F(RawSocketICMPTest, SendAndReceiveViaConnect) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), @@ -369,7 +368,7 @@ TEST_F(RawSocketICMPTest, SendAndReceiveViaConnect) { // Bind to localhost, then send and receive packets. TEST_F(RawSocketICMPTest, BindSendAndReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), @@ -392,7 +391,7 @@ TEST_F(RawSocketICMPTest, BindSendAndReceive) { // Bind and connect to localhost and send/receive packets. TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); ASSERT_THAT( bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)), @@ -418,7 +417,7 @@ TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) { // Set and get SO_LINGER. TEST_F(RawSocketICMPTest, SetAndGetSocketLinger) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int level = SOL_SOCKET; int type = SO_LINGER; @@ -440,7 +439,7 @@ TEST_F(RawSocketICMPTest, SetAndGetSocketLinger) { // Test getsockopt for SO_ACCEPTCONN. TEST_F(RawSocketICMPTest, GetSocketAcceptConn) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); int got = -1; socklen_t length = sizeof(got); diff --git a/test/syscalls/linux/readahead.cc b/test/syscalls/linux/readahead.cc index 71073bb3c..04104c912 100644 --- a/test/syscalls/linux/readahead.cc +++ b/test/syscalls/linux/readahead.cc @@ -16,8 +16,8 @@ #include <fcntl.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc index f72957f89..87b66aa98 100644 --- a/test/syscalls/linux/semaphore.cc +++ b/test/syscalls/linux/semaphore.cc @@ -1019,6 +1019,17 @@ TEST(SemaphoreTest, SemInfo) { EXPECT_EQ(info.semvmx, kSemVmx); } +TEST(SempahoreTest, RemoveNonExistentSemaphore) { + EXPECT_THAT(semctl(-1, 0, IPC_RMID), SyscallFailsWithErrno(EINVAL)); +} + +TEST(SempahoreTest, RemoveDeletedSemaphore) { + int id; + EXPECT_THAT(id = semget(IPC_PRIVATE, 1, 0), SyscallSucceeds()); + EXPECT_THAT(semctl(id, 0, IPC_RMID), SyscallSucceeds()); + EXPECT_THAT(semctl(id, 0, IPC_RMID), SyscallFailsWithErrno(EINVAL)); +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc index bea4ee71c..9bd3bd5e8 100644 --- a/test/syscalls/linux/sendfile.cc +++ b/test/syscalls/linux/sendfile.cc @@ -208,38 +208,6 @@ TEST(SendFileTest, SendAndUpdateFileOffset) { absl::string_view(actual, kHalfDataSize)); } -TEST(SendFileTest, SendToDevZeroAndUpdateFileOffset) { - // Create temp files. - // Test input string length must be > 2 AND even. - constexpr char kData[] = "The slings and arrows of outrageous fortune,"; - constexpr int kDataSize = sizeof(kData) - 1; - constexpr int kHalfDataSize = kDataSize / 2; - const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( - GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode)); - - // Open the input file as read only. - const FileDescriptor inf = - ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); - - // Open /dev/zero as write only. - const FileDescriptor outf = - ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY)); - - // Send data and verify that sendfile returns the correct value. - int bytes_sent; - EXPECT_THAT( - bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kHalfDataSize), - SyscallSucceedsWithValue(kHalfDataSize)); - - char actual[kHalfDataSize]; - // Verify that the input file offset has been updated. - ASSERT_THAT(read(inf.get(), &actual, kDataSize - bytes_sent), - SyscallSucceedsWithValue(kHalfDataSize)); - EXPECT_EQ( - absl::string_view(kData + kDataSize - bytes_sent, kDataSize - bytes_sent), - absl::string_view(actual, kHalfDataSize)); -} - TEST(SendFileTest, SendAndUpdateFileOffsetFromNonzeroStartingPoint) { // Create temp files. // Test input string length must be > 2 AND divisible by 4. @@ -609,23 +577,6 @@ TEST(SendFileTest, SendPipeBlocks) { SyscallSucceedsWithValue(kDataSize)); } -TEST(SendFileTest, SendToSpecialFile) { - // Create temp file. - const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( - GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); - - const FileDescriptor inf = - ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR)); - constexpr int kSize = 0x7ff; - ASSERT_THAT(ftruncate(inf.get(), kSize), SyscallSucceeds()); - - auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()); - - // eventfd can accept a number of bytes which is a multiple of 8. - EXPECT_THAT(sendfile(eventfd.get(), inf.get(), nullptr, 0xfffff), - SyscallSucceedsWithValue(kSize & (~7))); -} - TEST(SendFileTest, SendFileToPipe) { // Create temp file. constexpr char kData[] = "<insert-quote-here>"; @@ -672,57 +623,6 @@ TEST(SendFileTest, SendFileToSelf) { SyscallSucceedsWithValue(kSendfileSize)); } -static volatile int signaled = 0; -void SigUsr1Handler(int sig, siginfo_t* info, void* context) { signaled = 1; } - -TEST(SendFileTest, ToEventFDDoesNotSpin) { - FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0)); - - // Write the maximum value of an eventfd to a file. - const uint64_t kMaxEventfdValue = 0xfffffffffffffffe; - const auto tempfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - const auto tempfd = ASSERT_NO_ERRNO_AND_VALUE(Open(tempfile.path(), O_RDWR)); - ASSERT_THAT( - pwrite(tempfd.get(), &kMaxEventfdValue, sizeof(kMaxEventfdValue), 0), - SyscallSucceedsWithValue(sizeof(kMaxEventfdValue))); - - // Set the eventfd's value to 1. - const uint64_t kOne = 1; - ASSERT_THAT(write(efd.get(), &kOne, sizeof(kOne)), - SyscallSucceedsWithValue(sizeof(kOne))); - - // Set up signal handler. - struct sigaction sa = {}; - sa.sa_sigaction = SigUsr1Handler; - sa.sa_flags = SA_SIGINFO; - const auto cleanup_sigact = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGUSR1, sa)); - - // Send SIGUSR1 to this thread in 1 second. - struct sigevent sev = {}; - sev.sigev_notify = SIGEV_THREAD_ID; - sev.sigev_signo = SIGUSR1; - sev.sigev_notify_thread_id = gettid(); - auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev)); - struct itimerspec its = {}; - its.it_value = absl::ToTimespec(absl::Seconds(1)); - DisableSave ds; // Asserting an EINTR. - ASSERT_NO_ERRNO(timer.Set(0, its)); - - // Sendfile from tempfd to the eventfd. Since the eventfd is not already at - // its maximum value, the eventfd is "ready for writing"; however, since the - // eventfd's existing value plus the new value would exceed the maximum, the - // write should internally fail with EWOULDBLOCK. In this case, sendfile() - // should block instead of spinning, and eventually be interrupted by our - // timer. See b/172075629. - EXPECT_THAT( - sendfile(efd.get(), tempfd.get(), nullptr, sizeof(kMaxEventfdValue)), - SyscallFailsWithErrno(EINTR)); - - // Signal should have been handled. - EXPECT_EQ(signaled, 1); -} - } // namespace } // namespace testing diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc index c101fe9d2..ac6e89e91 100644 --- a/test/syscalls/linux/sendfile_socket.cc +++ b/test/syscalls/linux/sendfile_socket.cc @@ -24,8 +24,8 @@ #include "gtest/gtest.h" #include "absl/strings/string_view.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc index 7b966484d..d2762b6e9 100644 --- a/test/syscalls/linux/socket.cc +++ b/test/syscalls/linux/socket.cc @@ -20,8 +20,8 @@ #include <unistd.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/temp_umask.h" #include "test/util/test_util.h" @@ -119,6 +119,9 @@ TEST(SocketTest, UnixSCMRightsOnlyPassedOnce) { // Send more than what will fit inside the send/receive buffers, so that it is // split into multiple messages. constexpr int kBufSize = 0x100000; + // Heap allocation is async-signal-unsafe and thus cannot occur between fork() + // and execve(). + std::vector<char> buf(kBufSize); pid_t pid = fork(); if (pid == 0) { @@ -127,7 +130,6 @@ TEST(SocketTest, UnixSCMRightsOnlyPassedOnce) { // Construct a message with some control message. struct msghdr msg = {}; char control[CMSG_SPACE(sizeof(int))] = {}; - std::vector<char> buf(kBufSize); struct iovec iov = {}; msg.msg_control = control; msg.msg_controllen = sizeof(control); @@ -154,7 +156,6 @@ TEST(SocketTest, UnixSCMRightsOnlyPassedOnce) { struct msghdr msg = {}; char control[CMSG_SPACE(sizeof(int))] = {}; - std::vector<char> buf(kBufSize); struct iovec iov = {}; msg.msg_control = &control; msg.msg_controllen = sizeof(control); diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc index 00999f192..d450fad14 100644 --- a/test/syscalls/linux/socket_abstract.cc +++ b/test/syscalls/linux/socket_abstract.cc @@ -15,10 +15,10 @@ #include <vector> #include "test/syscalls/linux/socket_generic.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/socket_unix.h" #include "test/syscalls/linux/socket_unix_cmsg.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_bind_to_device.cc b/test/syscalls/linux/socket_bind_to_device.cc index 6b27f6eab..dac31a90c 100644 --- a/test/syscalls/linux/socket_bind_to_device.cc +++ b/test/syscalls/linux/socket_bind_to_device.cc @@ -34,8 +34,8 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_bind_to_device_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc index 3b108cbd3..4cddb875a 100644 --- a/test/syscalls/linux/socket_bind_to_device_distribution.cc +++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc @@ -35,8 +35,8 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_bind_to_device_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -77,34 +77,6 @@ class BindToDeviceDistributionTest } }; -PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) { - switch (family) { - case AF_INET: - return static_cast<uint16_t>( - reinterpret_cast<sockaddr_in const*>(&addr)->sin_port); - case AF_INET6: - return static_cast<uint16_t>( - reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port); - default: - return PosixError(EINVAL, - absl::StrCat("unknown socket family: ", family)); - } -} - -PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) { - switch (family) { - case AF_INET: - reinterpret_cast<sockaddr_in*>(addr)->sin_port = port; - return NoError(); - case AF_INET6: - reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port; - return NoError(); - default: - return PosixError(EINVAL, - absl::StrCat("unknown socket family: ", family)); - } -} - // Binds sockets to different devices and then creates many TCP connections. // Checks that the distribution of connections received on the sockets matches // the expectation. diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc index d3cc71dbf..334b46730 100644 --- a/test/syscalls/linux/socket_bind_to_device_sequence.cc +++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc @@ -36,8 +36,8 @@ #include "absl/container/node_hash_map.h" #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_bind_to_device_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc index 7e88aa2d9..5262e9ed9 100644 --- a/test/syscalls/linux/socket_blocking.cc +++ b/test/syscalls/linux/socket_blocking.cc @@ -23,8 +23,8 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" #include "test/util/timer_util.h" diff --git a/test/syscalls/linux/socket_blocking.h b/test/syscalls/linux/socket_blocking.h index db26e5ef5..89134ec30 100644 --- a/test/syscalls/linux/socket_blocking.h +++ b/test/syscalls/linux/socket_blocking.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_BLOCKING_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_BLOCKING_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_capability.cc b/test/syscalls/linux/socket_capability.cc index f75482aba..95cf1f6b4 100644 --- a/test/syscalls/linux/socket_capability.cc +++ b/test/syscalls/linux/socket_capability.cc @@ -16,9 +16,9 @@ // headers). #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc index 287359363..a611e9f4e 100644 --- a/test/syscalls/linux/socket_filesystem.cc +++ b/test/syscalls/linux/socket_filesystem.cc @@ -15,10 +15,10 @@ #include <vector> #include "test/syscalls/linux/socket_generic.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/socket_unix.h" #include "test/syscalls/linux/socket_unix_cmsg.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_generic.h b/test/syscalls/linux/socket_generic.h index 00ae7bfc3..a13262355 100644 --- a/test/syscalls/linux/socket_generic.h +++ b/test/syscalls/linux/socket_generic.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc index c35aa2183..9ff385b41 100644 --- a/test/syscalls/linux/socket_generic_stress.cc +++ b/test/syscalls/linux/socket_generic_stress.cc @@ -29,57 +29,19 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" namespace gvisor { namespace testing { -constexpr char kRangeFile[] = "/proc/sys/net/ipv4/ip_local_port_range"; - -PosixErrorOr<int> NumPorts() { - int min = 0; - int max = 1 << 16; - - // Read the ephemeral range from /proc. - ASSIGN_OR_RETURN_ERRNO(std::string rangefile, GetContents(kRangeFile)); - const std::string err_msg = - absl::StrFormat("%s has invalid content: %s", kRangeFile, rangefile); - if (rangefile.back() != '\n') { - return PosixError(EINVAL, err_msg); - } - rangefile.pop_back(); - std::vector<std::string> range = - absl::StrSplit(rangefile, absl::ByAnyChar("\t ")); - if (range.size() < 2 || !absl::SimpleAtoi(range.front(), &min) || - !absl::SimpleAtoi(range.back(), &max)) { - return PosixError(EINVAL, err_msg); - } - - // If we can open as writable, limit the range. - if (!access(kRangeFile, W_OK)) { - ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, - Open(kRangeFile, O_WRONLY | O_TRUNC, 0)); - max = min + 50; - const std::string small_range = absl::StrFormat("%d %d", min, max); - int n = write(fd.get(), small_range.c_str(), small_range.size()); - if (n < 0) { - return PosixError( - errno, - absl::StrFormat("write(%d [%s], \"%s\", %d)", fd.get(), kRangeFile, - small_range.c_str(), small_range.size())); - } - } - return max - min; -} - // Test fixture for tests that apply to pairs of connected sockets. using ConnectStressTest = SocketPairTest; TEST_P(ConnectStressTest, Reset) { - const int nports = ASSERT_NO_ERRNO_AND_VALUE(NumPorts()); + const int nports = ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); for (int i = 0; i < nports * 2; i++) { const std::unique_ptr<SocketPair> sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -103,7 +65,7 @@ TEST_P(ConnectStressTest, Reset) { // Tests that opening too many connections -- without closing them -- does lead // to port exhaustion. TEST_P(ConnectStressTest, TooManyOpen) { - const int nports = ASSERT_NO_ERRNO_AND_VALUE(NumPorts()); + const int nports = ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); int err_num = 0; std::vector<std::unique_ptr<SocketPair>> sockets = std::vector<std::unique_ptr<SocketPair>>(nports); @@ -164,7 +126,7 @@ class PersistentListenerConnectStressTest : public SocketPairTest { }; TEST_P(PersistentListenerConnectStressTest, ShutdownCloseFirst) { - const int nports = ASSERT_NO_ERRNO_AND_VALUE(NumPorts()); + const int nports = ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); for (int i = 0; i < nports * 2; i++) { std::unique_ptr<SocketPair> sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketSleep()); @@ -185,7 +147,7 @@ TEST_P(PersistentListenerConnectStressTest, ShutdownCloseFirst) { } TEST_P(PersistentListenerConnectStressTest, ShutdownCloseSecond) { - const int nports = ASSERT_NO_ERRNO_AND_VALUE(NumPorts()); + const int nports = ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); for (int i = 0; i < nports * 2; i++) { const std::unique_ptr<SocketPair> sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -206,7 +168,7 @@ TEST_P(PersistentListenerConnectStressTest, ShutdownCloseSecond) { } TEST_P(PersistentListenerConnectStressTest, Close) { - const int nports = ASSERT_NO_ERRNO_AND_VALUE(NumPorts()); + const int nports = ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); for (int i = 0; i < nports * 2; i++) { std::unique_ptr<SocketPair> sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketSleep()); diff --git a/test/syscalls/linux/socket_generic_test_cases.cc b/test/syscalls/linux/socket_generic_test_cases.cc index 5c4cb6c35..c509d54e2 100644 --- a/test/syscalls/linux/socket_generic_test_cases.cc +++ b/test/syscalls/linux/socket_generic_test_cases.cc @@ -14,6 +14,9 @@ #include "test/syscalls/linux/socket_generic.h" +#ifdef __linux__ +#include <linux/capability.h> +#endif // __linux__ #include <stdio.h> #include <sys/ioctl.h> #include <sys/socket.h> @@ -22,8 +25,9 @@ #include "gtest/gtest.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/capability_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // This file is a generic socket test file. It must be built with another file @@ -400,6 +404,46 @@ TEST_P(AllSocketPairTest, RcvBufSucceeds) { EXPECT_GT(size, 0); } +#ifdef __linux__ + +// Check that setting SO_RCVBUFFORCE above max is not clamped to the maximum +// receive buffer size. +TEST_P(AllSocketPairTest, SetSocketRecvBufForceAboveMax) { + std::unique_ptr<SocketPair> sockets = + ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Discover maxmimum buffer size by setting to a really large value. + constexpr int kRcvBufSz = 0xffffffff; + ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, + sizeof(kRcvBufSz)), + SyscallSucceeds()); + + int max = 0; + socklen_t max_len = sizeof(max); + ASSERT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &max, &max_len), + SyscallSucceeds()); + + int above_max = max + 1; + int sso = setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUFFORCE, + &above_max, sizeof(above_max)); + if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))) { + ASSERT_THAT(sso, SyscallFailsWithErrno(EPERM)); + return; + } + ASSERT_THAT(sso, SyscallSucceeds()); + + int val = 0; + socklen_t val_len = sizeof(val); + ASSERT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &val, &val_len), + SyscallSucceeds()); + // The system doubles the passed-in maximum. + ASSERT_EQ(above_max * 2, val); +} + +#endif // __linux__ + TEST_P(AllSocketPairTest, GetSndBufSucceeds) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); int size = 0; diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index a3bdada86..13a83a1b3 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -34,10 +34,11 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_inet_loopback_test_params.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" #include "test/util/save_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -48,45 +49,7 @@ namespace { using ::testing::Gt; -PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) { - switch (family) { - case AF_INET: - return static_cast<uint16_t>( - reinterpret_cast<sockaddr_in const*>(&addr)->sin_port); - case AF_INET6: - return static_cast<uint16_t>( - reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port); - default: - return PosixError(EINVAL, - absl::StrCat("unknown socket family: ", family)); - } -} - -PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) { - switch (family) { - case AF_INET: - reinterpret_cast<sockaddr_in*>(addr)->sin_port = port; - return NoError(); - case AF_INET6: - reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port; - return NoError(); - default: - return PosixError(EINVAL, - absl::StrCat("unknown socket family: ", family)); - } -} - -struct TestParam { - TestAddress listener; - TestAddress connector; -}; - -std::string DescribeTestParam(::testing::TestParamInfo<TestParam> const& info) { - return absl::StrCat("Listen", info.param.listener.description, "_Connect", - info.param.connector.description); -} - -using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>; +using SocketInetLoopbackTest = ::testing::TestWithParam<SocketInetTestParam>; TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) { int fd[2] = {}; @@ -299,7 +262,7 @@ void tcpSimpleConnectTest(TestAddress const& listener, } TEST_P(SocketInetLoopbackTest, TCP) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -307,7 +270,7 @@ TEST_P(SocketInetLoopbackTest, TCP) { } TEST_P(SocketInetLoopbackTest, TCPListenUnbound) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -316,7 +279,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenUnbound) { } TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) { - const auto& param = GetParam(); + SocketInetTestParam const& param = GetParam(); const TestAddress& listener = param.listener; const TestAddress& connector = param.connector; @@ -345,7 +308,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) { sockaddr_storage conn_addr = connector.addr; ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); - // TODO(b/157236388): Remove Disable save after bug is fixed. S/R test can + // TODO(b/153489135): Remove Disable save after bug is fixed. S/R test can // fail because the last socket may not be delivered to the accept queue // by the time connect returns. DisableSave ds; @@ -362,13 +325,14 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) { } TEST_P(SocketInetLoopbackTest, TCPListenShutdown) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; constexpr int kBacklog = 2; - constexpr int kFDs = kBacklog + 1; + // See the comment in TCPBacklog for why this isn't kBacklog + 1. + constexpr int kFDs = kBacklog; // Create the listening socket. FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( @@ -429,7 +393,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdown) { } TEST_P(SocketInetLoopbackTest, TCPListenClose) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -454,6 +418,8 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) { uint16_t const port = ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + // Connect repeatedly, keeping each connection open. After kBacklog + // connections, we'll start getting EINPROGRESS. sockaddr_storage conn_addr = connector.addr; ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); std::vector<FileDescriptor> clients; @@ -474,7 +440,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) { // Test the protocol state information returned by TCPINFO. TEST_P(SocketInetLoopbackTest, TCPInfoState) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -543,7 +509,7 @@ TEST_P(SocketInetLoopbackTest, TCPInfoState) { ASSERT_THAT(close(conn_fd.release()), SyscallSucceeds()); } -void TestHangupDuringConnect(const TestParam& param, +void TestHangupDuringConnect(const SocketInetTestParam& param, void (*hangup)(FileDescriptor&)) { TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -606,8 +572,10 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownDuringConnect) { }); } -void TestListenHangupConnectingRead(const TestParam& param, +void TestListenHangupConnectingRead(const SocketInetTestParam& param, void (*hangup)(FileDescriptor&)) { + constexpr int kTimeout = 10000; + TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -637,14 +605,33 @@ void TestListenHangupConnectingRead(const TestParam& param, sockaddr_storage conn_addr = connector.addr; ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); FileDescriptor established_client = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT(connect(established_client.get(), AsSockAddr(&conn_addr), - connector.addr_len), - SyscallSucceeds()); + Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); + int ret = connect(established_client.get(), AsSockAddr(&conn_addr), + connector.addr_len); + if (ret != 0) { + EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS)); + } - // Ensure that the accept queue has the completed connection. - constexpr int kTimeout = 10000; + // On some kernels a backlog of 0 means no backlog, while on others it means a + // backlog of 1. See commit c609e6aae4efcf383fe86b195d1b060befcb3666 for more + // explanation. + // + // If we timeout connecting to loopback, we're on a kernel with no backlog. pollfd pfd = { + .fd = established_client.get(), + .events = POLLIN | POLLOUT, + }; + if (!poll(&pfd, 1, kTimeout)) { + // We're on one of those kernels. It should be impossible to establish the + // connection, so connect will always return EALREADY. + EXPECT_THAT(connect(established_client.get(), AsSockAddr(&conn_addr), + connector.addr_len), + SyscallFailsWithErrno(EALREADY)); + return; + } + + // Ensure that the accept queue has the completed connection. + pfd = { .fd = listen_fd.get(), .events = POLLIN, }; @@ -654,8 +641,8 @@ void TestListenHangupConnectingRead(const TestParam& param, FileDescriptor connecting_client = ASSERT_NO_ERRNO_AND_VALUE( Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); // Keep the last client in connecting state. - int ret = connect(connecting_client.get(), AsSockAddr(&conn_addr), - connector.addr_len); + ret = connect(connecting_client.get(), AsSockAddr(&conn_addr), + connector.addr_len); if (ret != 0) { EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS)); } @@ -694,7 +681,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownConnectingRead) { // Test close of a non-blocking connecting socket. TEST_P(SocketInetLoopbackTest, TCPNonBlockingConnectClose) { - TestParam const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -720,7 +707,7 @@ TEST_P(SocketInetLoopbackTest, TCPNonBlockingConnectClose) { // Try many iterations to catch a race with socket close and handshake // completion. - for (int i = 0; i < 1000; ++i) { + for (int i = 0; i < 100; ++i) { FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE( Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); ASSERT_THAT( @@ -764,12 +751,12 @@ TEST_P(SocketInetLoopbackTest, TCPNonBlockingConnectClose) { } } -// TODO(b/157236388): Remove once bug is fixed. Test fails w/ +// TODO(b/153489135): Remove once bug is fixed. Test fails w/ // random save as established connections which can't be delivered to the accept // queue because the queue is full are not correctly delivered after restore // causing the last accept to timeout on the restore. TEST_P(SocketInetLoopbackTest, TCPAcceptBacklogSizes) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -795,7 +782,8 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptBacklogSizes) { if (backlog < 0) { expected_accepts = 1024; } else { - expected_accepts = backlog + 1; + // See the comment in TCPBacklog for why this isn't backlog + 1. + expected_accepts = backlog; } for (int i = 0; i < expected_accepts; i++) { SCOPED_TRACE(absl::StrCat("i=", i)); @@ -813,12 +801,12 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptBacklogSizes) { } } -// TODO(b/157236388): Remove once bug is fixed. Test fails w/ +// TODO(b/153489135): Remove once bug is fixed. Test fails w/ // random save as established connections which can't be delivered to the accept // queue because the queue is full are not correctly delivered after restore // causing the last accept to timeout on the restore. TEST_P(SocketInetLoopbackTest, TCPBacklog) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -896,16 +884,20 @@ TEST_P(SocketInetLoopbackTest, TCPBacklog) { // enqueuing established connections to the accept queue, newer SYNs could // still be replied to causing those client connections would be accepted as // we start dequeuing the queue. - ASSERT_GE(accepted_conns, kBacklogSize + 1); + // + // On some kernels this can value can be off by one, so we don't add 1 to + // kBacklogSize. See commit c609e6aae4efcf383fe86b195d1b060befcb3666 for more + // explanation. + ASSERT_GE(accepted_conns, kBacklogSize); ASSERT_GE(client_conns, accepted_conns); } -// TODO(b/157236388): Remove once bug is fixed. Test fails w/ +// TODO(b/153489135): Remove once bug is fixed. Test fails w/ // random save as established connections which can't be delivered to the accept // queue because the queue is full are not correctly delivered after restore // causing the last accept to timeout on the restore. TEST_P(SocketInetLoopbackTest, TCPBacklogAcceptAll) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -931,7 +923,9 @@ TEST_P(SocketInetLoopbackTest, TCPBacklogAcceptAll) { // Fill up the accept queue and trigger more client connections which would be // waiting to be accepted. - std::array<FileDescriptor, kBacklog + 1> established_clients; + // + // See the comment in TCPBacklog for why this isn't backlog + 1. + std::array<FileDescriptor, kBacklog> established_clients; for (auto& fd : established_clients) { fd = ASSERT_NO_ERRNO_AND_VALUE( Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); @@ -993,175 +987,12 @@ TEST_P(SocketInetLoopbackTest, TCPBacklogAcceptAll) { } } -// TCPFinWait2Test creates a pair of connected sockets then closes one end to -// trigger FIN_WAIT2 state for the closed endpoint. Then it binds the same local -// IP/port on a new socket and tries to connect. The connect should fail w/ -// an EADDRINUSE. Then we wait till the FIN_WAIT2 timeout is over and try the -// connect again with a new socket and this time it should succeed. -// -// TCP timers are not S/R today, this can cause this test to be flaky when run -// under random S/R due to timer being reset on a restore. -TEST_P(SocketInetLoopbackTest, TCPFinWait2Test) { - auto const& param = GetParam(); - TestAddress const& listener = param.listener; - TestAddress const& connector = param.connector; - - // Create the listening socket. - const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); - sockaddr_storage listen_addr = listener.addr; - ASSERT_THAT( - bind(listen_fd.get(), AsSockAddr(&listen_addr), listener.addr_len), - SyscallSucceeds()); - ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); - - // Get the port bound by the listening socket. - socklen_t addrlen = listener.addr_len; - ASSERT_THAT(getsockname(listen_fd.get(), AsSockAddr(&listen_addr), &addrlen), - SyscallSucceeds()); - - uint16_t const port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); - - // Connect to the listening socket. - FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); - - // Lower FIN_WAIT2 state to 5 seconds for test. - constexpr int kTCPLingerTimeout = 5; - EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2, - &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)), - SyscallSucceedsWithValue(0)); - - sockaddr_storage conn_addr = connector.addr; - ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); - ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), - connector.addr_len), - SyscallSucceeds()); - - // Accept the connection. - auto accepted = - ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); - - // Get the address/port bound by the connecting socket. - sockaddr_storage conn_bound_addr; - socklen_t conn_addrlen = connector.addr_len; - ASSERT_THAT( - getsockname(conn_fd.get(), AsSockAddr(&conn_bound_addr), &conn_addrlen), - SyscallSucceeds()); - - // close the connecting FD to trigger FIN_WAIT2 on the connected fd. - conn_fd.reset(); - - // Now bind and connect a new socket. - const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); - - // Disable cooperative saves after this point. As a save between the first - // bind/connect and the second one can cause the linger timeout timer to - // be restarted causing the final bind/connect to fail. - DisableSave ds; - - ASSERT_THAT(bind(conn_fd2.get(), AsSockAddr(&conn_bound_addr), conn_addrlen), - SyscallFailsWithErrno(EADDRINUSE)); - - // Sleep for a little over the linger timeout to reduce flakiness in - // save/restore tests. - absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 2)); - - ds.reset(); - - ASSERT_THAT( - RetryEINTR(connect)(conn_fd2.get(), AsSockAddr(&conn_addr), conn_addrlen), - SyscallSucceeds()); -} - -// TCPLinger2TimeoutAfterClose creates a pair of connected sockets -// then closes one end to trigger FIN_WAIT2 state for the closed endpont. -// It then sleeps for the TCP_LINGER2 timeout and verifies that bind/ -// connecting the same address succeeds. -// -// TCP timers are not S/R today, this can cause this test to be flaky when run -// under random S/R due to timer being reset on a restore. -TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose) { - auto const& param = GetParam(); - TestAddress const& listener = param.listener; - TestAddress const& connector = param.connector; - - // Create the listening socket. - const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); - sockaddr_storage listen_addr = listener.addr; - ASSERT_THAT( - bind(listen_fd.get(), AsSockAddr(&listen_addr), listener.addr_len), - SyscallSucceeds()); - ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); - - // Get the port bound by the listening socket. - socklen_t addrlen = listener.addr_len; - ASSERT_THAT(getsockname(listen_fd.get(), AsSockAddr(&listen_addr), &addrlen), - SyscallSucceeds()); - - uint16_t const port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); - - // Connect to the listening socket. - FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); - - sockaddr_storage conn_addr = connector.addr; - ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); - ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), - connector.addr_len), - SyscallSucceeds()); - - // Accept the connection. - auto accepted = - ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); - - // Get the address/port bound by the connecting socket. - sockaddr_storage conn_bound_addr; - socklen_t conn_addrlen = connector.addr_len; - ASSERT_THAT( - getsockname(conn_fd.get(), AsSockAddr(&conn_bound_addr), &conn_addrlen), - SyscallSucceeds()); - - // Disable cooperative saves after this point as TCP timers are not restored - // across a S/R. - { - DisableSave ds; - constexpr int kTCPLingerTimeout = 5; - EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2, - &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)), - SyscallSucceedsWithValue(0)); - - // close the connecting FD to trigger FIN_WAIT2 on the connected fd. - conn_fd.reset(); - - absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1)); - - // ds going out of scope will Re-enable S/R's since at this point the timer - // must have fired and cleaned up the endpoint. - } - - // Now bind and connect a new socket and verify that we can immediately - // rebind the address bound by the conn_fd as it never entered TIME_WAIT. - const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); - - ASSERT_THAT(bind(conn_fd2.get(), AsSockAddr(&conn_bound_addr), conn_addrlen), - SyscallSucceeds()); - ASSERT_THAT( - RetryEINTR(connect)(conn_fd2.get(), AsSockAddr(&conn_addr), conn_addrlen), - SyscallSucceeds()); -} - // TCPResetAfterClose creates a pair of connected sockets then closes // one end to trigger FIN_WAIT2 state for the closed endpoint verifies // that we generate RSTs for any new data after the socket is fully // closed. TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -1221,198 +1052,8 @@ TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) { SyscallSucceedsWithValue(0)); } -// setupTimeWaitClose sets up a socket endpoint in TIME_WAIT state. -// Callers can choose to perform active close on either ends of the connection -// and also specify if they want to enabled SO_REUSEADDR. -void setupTimeWaitClose(const TestAddress* listener, - const TestAddress* connector, bool reuse, - bool accept_close, sockaddr_storage* listen_addr, - sockaddr_storage* conn_bound_addr) { - // Create the listening socket. - FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(listener->family(), SOCK_STREAM, IPPROTO_TCP)); - if (reuse) { - ASSERT_THAT(setsockopt(listen_fd.get(), SOL_SOCKET, SO_REUSEADDR, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceeds()); - } - ASSERT_THAT( - bind(listen_fd.get(), AsSockAddr(listen_addr), listener->addr_len), - SyscallSucceeds()); - ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); - - // Get the port bound by the listening socket. - socklen_t addrlen = listener->addr_len; - ASSERT_THAT(getsockname(listen_fd.get(), AsSockAddr(listen_addr), &addrlen), - SyscallSucceeds()); - - uint16_t const port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener->family(), *listen_addr)); - - // Connect to the listening socket. - FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector->family(), SOCK_STREAM, IPPROTO_TCP)); - - // We disable saves after this point as a S/R causes the netstack seed - // to be regenerated which changes what ports/ISN is picked for a given - // tuple (src ip,src port, dst ip, dst port). This can cause the final - // SYN to use a sequence number that looks like one from the current - // connection in TIME_WAIT and will not be accepted causing the test - // to timeout. - // - // TODO(gvisor.dev/issue/940): S/R portSeed/portHint - DisableSave ds; - - sockaddr_storage conn_addr = connector->addr; - ASSERT_NO_ERRNO(SetAddrPort(connector->family(), &conn_addr, port)); - ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), - connector->addr_len), - SyscallSucceeds()); - - // Accept the connection. - auto accepted = - ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); - - // Get the address/port bound by the connecting socket. - socklen_t conn_addrlen = connector->addr_len; - ASSERT_THAT( - getsockname(conn_fd.get(), AsSockAddr(conn_bound_addr), &conn_addrlen), - SyscallSucceeds()); - - FileDescriptor active_closefd, passive_closefd; - if (accept_close) { - active_closefd = std::move(accepted); - passive_closefd = std::move(conn_fd); - } else { - active_closefd = std::move(conn_fd); - passive_closefd = std::move(accepted); - } - - // shutdown to trigger TIME_WAIT. - ASSERT_THAT(shutdown(active_closefd.get(), SHUT_WR), SyscallSucceeds()); - { - constexpr int kTimeout = 10000; - pollfd pfd = { - .fd = passive_closefd.get(), - .events = POLLIN, - }; - ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); - ASSERT_EQ(pfd.revents, POLLIN); - } - ASSERT_THAT(shutdown(passive_closefd.get(), SHUT_WR), SyscallSucceeds()); - { - constexpr int kTimeout = 10000; - constexpr int16_t want_events = POLLHUP; - pollfd pfd = { - .fd = active_closefd.get(), - .events = want_events, - }; - ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); - } - - // This sleep is needed to reduce flake to ensure that the passive-close - // ensures the state transitions to CLOSE from LAST_ACK. - absl::SleepFor(absl::Seconds(1)); -} - -// These tests are disabled under random save as the the restore run -// results in the stack.Seed() being different which can cause -// sequence number of final connect to be one that is considered -// old and can cause the test to be flaky. -// -// Test re-binding of client and server bound addresses when the older -// connection is in TIME_WAIT. -TEST_P(SocketInetLoopbackTest, TCPPassiveCloseNoTimeWaitTest) { - auto const& param = GetParam(); - sockaddr_storage listen_addr, conn_bound_addr; - listen_addr = param.listener.addr; - setupTimeWaitClose(¶m.listener, ¶m.connector, false /*reuse*/, - true /*accept_close*/, &listen_addr, &conn_bound_addr); - - // Now bind a new socket and verify that we can immediately rebind the address - // bound by the conn_fd as it never entered TIME_WAIT. - const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), - param.connector.addr_len), - SyscallSucceeds()); - - FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(param.listener.family(), SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT( - bind(listen_fd.get(), AsSockAddr(&listen_addr), param.listener.addr_len), - SyscallFailsWithErrno(EADDRINUSE)); -} - -TEST_P(SocketInetLoopbackTest, TCPPassiveCloseNoTimeWaitReuseTest) { - auto const& param = GetParam(); - sockaddr_storage listen_addr, conn_bound_addr; - listen_addr = param.listener.addr; - setupTimeWaitClose(¶m.listener, ¶m.connector, true /*reuse*/, - true /*accept_close*/, &listen_addr, &conn_bound_addr); - - FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(param.listener.family(), SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT(setsockopt(listen_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, - sizeof(kSockOptOn)), - SyscallSucceeds()); - ASSERT_THAT( - bind(listen_fd.get(), AsSockAddr(&listen_addr), param.listener.addr_len), - SyscallSucceeds()); - ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); - - // Now bind and connect new socket and verify that we can immediately rebind - // the address bound by the conn_fd as it never entered TIME_WAIT. - const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT(setsockopt(conn_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, - sizeof(kSockOptOn)), - SyscallSucceeds()); - ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), - param.connector.addr_len), - SyscallSucceeds()); - - uint16_t const port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(param.listener.family(), listen_addr)); - sockaddr_storage conn_addr = param.connector.addr; - ASSERT_NO_ERRNO(SetAddrPort(param.connector.family(), &conn_addr, port)); - ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), - param.connector.addr_len), - SyscallSucceeds()); -} - -TEST_P(SocketInetLoopbackTest, TCPActiveCloseTimeWaitTest) { - auto const& param = GetParam(); - sockaddr_storage listen_addr, conn_bound_addr; - listen_addr = param.listener.addr; - setupTimeWaitClose(¶m.listener, ¶m.connector, false /*reuse*/, - false /*accept_close*/, &listen_addr, &conn_bound_addr); - FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); - - ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), - param.connector.addr_len), - SyscallFailsWithErrno(EADDRINUSE)); -} - -TEST_P(SocketInetLoopbackTest, TCPActiveCloseTimeWaitReuseTest) { - auto const& param = GetParam(); - sockaddr_storage listen_addr, conn_bound_addr; - listen_addr = param.listener.addr; - setupTimeWaitClose(¶m.listener, ¶m.connector, true /*reuse*/, - false /*accept_close*/, &listen_addr, &conn_bound_addr); - FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); - ASSERT_THAT(setsockopt(conn_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, - sizeof(kSockOptOn)), - SyscallSucceeds()); - ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), - param.connector.addr_len), - SyscallFailsWithErrno(EADDRINUSE)); -} - TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -1464,7 +1105,7 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) { } TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -1495,7 +1136,7 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) { sockaddr_storage conn_addr = connector.addr; ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); - // TODO(b/157236388): Reenable Cooperative S/R once bug is fixed. + // TODO(b/153489135): Reenable Cooperative S/R once bug is fixed. DisableSave ds; ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), connector.addr_len), @@ -1575,7 +1216,7 @@ TEST_P(SocketInetLoopbackTest, TCPDeferAccept) { // saved. Enable S/R issue is fixed. DisableSave ds; - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -1655,7 +1296,7 @@ TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout) { // saved. Enable S/R once issue is fixed. DisableSave ds; - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -1722,42 +1363,16 @@ TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout) { ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); } -INSTANTIATE_TEST_SUITE_P( - All, SocketInetLoopbackTest, - ::testing::Values( - // Listeners bound to IPv4 addresses refuse connections using IPv6 - // addresses. - TestParam{V4Any(), V4Any()}, TestParam{V4Any(), V4Loopback()}, - TestParam{V4Any(), V4MappedAny()}, - TestParam{V4Any(), V4MappedLoopback()}, - TestParam{V4Loopback(), V4Any()}, TestParam{V4Loopback(), V4Loopback()}, - TestParam{V4Loopback(), V4MappedLoopback()}, - TestParam{V4MappedAny(), V4Any()}, - TestParam{V4MappedAny(), V4Loopback()}, - TestParam{V4MappedAny(), V4MappedAny()}, - TestParam{V4MappedAny(), V4MappedLoopback()}, - TestParam{V4MappedLoopback(), V4Any()}, - TestParam{V4MappedLoopback(), V4Loopback()}, - TestParam{V4MappedLoopback(), V4MappedLoopback()}, - - // Listeners bound to IN6ADDR_ANY accept all connections. - TestParam{V6Any(), V4Any()}, TestParam{V6Any(), V4Loopback()}, - TestParam{V6Any(), V4MappedAny()}, - TestParam{V6Any(), V4MappedLoopback()}, TestParam{V6Any(), V6Any()}, - TestParam{V6Any(), V6Loopback()}, - - // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4 - // addresses. - TestParam{V6Loopback(), V6Any()}, - TestParam{V6Loopback(), V6Loopback()}), - DescribeTestParam); +INSTANTIATE_TEST_SUITE_P(All, SocketInetLoopbackTest, + SocketInetLoopbackTestValues(), + DescribeSocketInetTestParam); -using SocketInetReusePortTest = ::testing::TestWithParam<TestParam>; +using SocketInetReusePortTest = ::testing::TestWithParam<SocketInetTestParam>; // TODO(gvisor.dev/issue/940): Remove when portHint/stack.Seed is // saved/restored. TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -1867,7 +1482,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { } TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -1978,7 +1593,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) { } TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -2086,32 +1701,23 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( // Listeners bound to IPv4 addresses refuse connections using IPv6 // addresses. - TestParam{V4Any(), V4Loopback()}, - TestParam{V4Loopback(), V4MappedLoopback()}, + SocketInetTestParam{V4Any(), V4Loopback()}, + SocketInetTestParam{V4Loopback(), V4MappedLoopback()}, // Listeners bound to IN6ADDR_ANY accept all connections. - TestParam{V6Any(), V4Loopback()}, TestParam{V6Any(), V6Loopback()}, + SocketInetTestParam{V6Any(), V4Loopback()}, + SocketInetTestParam{V6Any(), V6Loopback()}, // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4 // addresses. - TestParam{V6Loopback(), V6Loopback()}), - DescribeTestParam); - -struct ProtocolTestParam { - std::string description; - int type; -}; - -std::string DescribeProtocolTestParam( - ::testing::TestParamInfo<ProtocolTestParam> const& info) { - return info.param.description; -} + SocketInetTestParam{V6Loopback(), V6Loopback()}), + DescribeSocketInetTestParam); using SocketMultiProtocolInetLoopbackTest = ::testing::TestWithParam<ProtocolTestParam>; TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedLoopbackOnlyReservesV4) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); for (int i = 0; true; i++) { // Bind the v4 loopback on a dual stack socket. @@ -2160,7 +1766,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedLoopbackOnlyReservesV4) { } TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedAnyOnlyReservesV4) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); for (int i = 0; true; i++) { // Bind the v4 any on a dual stack socket. @@ -2209,7 +1815,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedAnyOnlyReservesV4) { } TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReservesEverything) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); // Bind the v6 any on a dual stack socket. TestAddress const& test_addr_dual = V6Any(); @@ -2272,7 +1878,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReservesEverything) { TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReuseAddrDoesNotReserveV4Any) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); // Bind the v6 any on a dual stack socket. TestAddress const& test_addr_dual = V6Any(); @@ -2309,7 +1915,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReuseAddrListenReservesV4Any) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); // Only TCP sockets are supported. SKIP_IF((param.type & SOCK_STREAM) == 0); @@ -2352,7 +1958,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyWithListenReservesEverything) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); // Only TCP sockets are supported. SKIP_IF((param.type & SOCK_STREAM) == 0); @@ -2419,7 +2025,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, } TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); for (int i = 0; true; i++) { // Bind the v6 any on a v6-only socket. @@ -2472,7 +2078,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) { } TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); for (int i = 0; true; i++) { // Bind the v6 loopback on a dual stack socket. @@ -2552,66 +2158,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) { } } -TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) { - auto const& param = GetParam(); - - // Bind the v6 loopback on a dual stack socket. - TestAddress const& test_addr = V6Loopback(); - sockaddr_storage bound_addr = test_addr.addr; - const FileDescriptor bound_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - ASSERT_THAT(bind(bound_fd.get(), AsSockAddr(&bound_addr), test_addr.addr_len), - SyscallSucceeds()); - ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, - sizeof(kSockOptOn)), - SyscallSucceeds()); - - // Listen iff TCP. - if (param.type == SOCK_STREAM) { - ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); - } - - // Get the port that we bound. - socklen_t bound_addr_len = test_addr.addr_len; - ASSERT_THAT( - getsockname(bound_fd.get(), AsSockAddr(&bound_addr), &bound_addr_len), - SyscallSucceeds()); - - // Connect to bind an ephemeral port. - const FileDescriptor connected_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceeds()); - ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(), AsSockAddr(&bound_addr), - bound_addr_len), - SyscallSucceeds()); - - // Get the ephemeral port. - sockaddr_storage connected_addr = {}; - socklen_t connected_addr_len = sizeof(connected_addr); - ASSERT_THAT(getsockname(connected_fd.get(), AsSockAddr(&connected_addr), - &connected_addr_len), - SyscallSucceeds()); - uint16_t const ephemeral_port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); - - // Verify that we actually got an ephemeral port. - ASSERT_NE(ephemeral_port, 0); - - // Verify that the ephemeral port is not reserved. - const FileDescriptor checking_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceeds()); - EXPECT_THAT( - bind(checking_fd.get(), AsSockAddr(&connected_addr), connected_addr_len), - SyscallSucceeds()); -} - TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); for (int i = 0; true; i++) { // Bind the v4 loopback on a dual stack socket. @@ -2723,68 +2271,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) { } } -TEST_P(SocketMultiProtocolInetLoopbackTest, - V4MappedEphemeralPortReservedResueAddr) { - auto const& param = GetParam(); - - // Bind the v4 loopback on a dual stack socket. - TestAddress const& test_addr = V4MappedLoopback(); - sockaddr_storage bound_addr = test_addr.addr; - const FileDescriptor bound_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - ASSERT_THAT(bind(bound_fd.get(), AsSockAddr(&bound_addr), test_addr.addr_len), - SyscallSucceeds()); - - ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, - sizeof(kSockOptOn)), - SyscallSucceeds()); - - // Listen iff TCP. - if (param.type == SOCK_STREAM) { - ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); - } - - // Get the port that we bound. - socklen_t bound_addr_len = test_addr.addr_len; - ASSERT_THAT( - getsockname(bound_fd.get(), AsSockAddr(&bound_addr), &bound_addr_len), - SyscallSucceeds()); - - // Connect to bind an ephemeral port. - const FileDescriptor connected_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceeds()); - ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(), AsSockAddr(&bound_addr), - bound_addr_len), - SyscallSucceeds()); - - // Get the ephemeral port. - sockaddr_storage connected_addr = {}; - socklen_t connected_addr_len = sizeof(connected_addr); - ASSERT_THAT(getsockname(connected_fd.get(), AsSockAddr(&connected_addr), - &connected_addr_len), - SyscallSucceeds()); - uint16_t const ephemeral_port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); - - // Verify that we actually got an ephemeral port. - ASSERT_NE(ephemeral_port, 0); - - // Verify that the ephemeral port is not reserved. - const FileDescriptor checking_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceeds()); - EXPECT_THAT( - bind(checking_fd.get(), AsSockAddr(&connected_addr), connected_addr_len), - SyscallSucceeds()); -} - TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); for (int i = 0; true; i++) { // Bind the v4 loopback on a v4 socket. @@ -2897,71 +2385,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) { } } -TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) { - auto const& param = GetParam(); - - // Bind the v4 loopback on a v4 socket. - TestAddress const& test_addr = V4Loopback(); - sockaddr_storage bound_addr = test_addr.addr; - const FileDescriptor bound_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - - ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, - sizeof(kSockOptOn)), - SyscallSucceeds()); - - ASSERT_THAT(bind(bound_fd.get(), AsSockAddr(&bound_addr), test_addr.addr_len), - SyscallSucceeds()); - - // Listen iff TCP. - if (param.type == SOCK_STREAM) { - ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); - } - - // Get the port that we bound. - socklen_t bound_addr_len = test_addr.addr_len; - ASSERT_THAT( - getsockname(bound_fd.get(), AsSockAddr(&bound_addr), &bound_addr_len), - SyscallSucceeds()); - - // Connect to bind an ephemeral port. - const FileDescriptor connected_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - - ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceeds()); - - ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(), AsSockAddr(&bound_addr), - bound_addr_len), - SyscallSucceeds()); - - // Get the ephemeral port. - sockaddr_storage connected_addr = {}; - socklen_t connected_addr_len = sizeof(connected_addr); - ASSERT_THAT(getsockname(connected_fd.get(), AsSockAddr(&connected_addr), - &connected_addr_len), - SyscallSucceeds()); - uint16_t const ephemeral_port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); - - // Verify that we actually got an ephemeral port. - ASSERT_NE(ephemeral_port, 0); - - // Verify that the ephemeral port is not reserved. - const FileDescriptor checking_fd = - ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); - ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR, - &kSockOptOn, sizeof(kSockOptOn)), - SyscallSucceeds()); - EXPECT_THAT( - bind(checking_fd.get(), AsSockAddr(&connected_addr), connected_addr_len), - SyscallSucceeds()); -} - TEST_P(SocketMultiProtocolInetLoopbackTest, MultipleBindsAllowedNoListeningReuseAddr) { - const auto& param = GetParam(); + ProtocolTestParam const& param = GetParam(); // UDP sockets are allowed to bind/listen on the port w/ SO_REUSEADDR, for TCP // this is only permitted if there is no other listening socket. SKIP_IF(param.type != SOCK_STREAM); @@ -2996,7 +2422,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, } TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); TestAddress const& test_addr = V4Loopback(); sockaddr_storage addr = test_addr.addr; @@ -3049,7 +2475,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) { // closed, we can bind a different socket to the same address without needing // REUSEPORT. TEST_P(SocketMultiProtocolInetLoopbackTest, NoReusePortFollowingReusePort) { - auto const& param = GetParam(); + ProtocolTestParam const& param = GetParam(); TestAddress const& test_addr = V4Loopback(); sockaddr_storage addr = test_addr.addr; @@ -3076,11 +2502,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, NoReusePortFollowingReusePort) { ASSERT_THAT(bind(fd, AsSockAddr(&addr), addrlen), SyscallSucceeds()); } -INSTANTIATE_TEST_SUITE_P( - AllFamilies, SocketMultiProtocolInetLoopbackTest, - ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM}, - ProtocolTestParam{"UDP", SOCK_DGRAM}), - DescribeProtocolTestParam); +INSTANTIATE_TEST_SUITE_P(AllFamilies, SocketMultiProtocolInetLoopbackTest, + ProtocolTestValues(), DescribeProtocolTestParam); } // namespace diff --git a/test/syscalls/linux/socket_inet_loopback_isolated.cc b/test/syscalls/linux/socket_inet_loopback_isolated.cc new file mode 100644 index 000000000..182d20a9e --- /dev/null +++ b/test/syscalls/linux/socket_inet_loopback_isolated.cc @@ -0,0 +1,489 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <netinet/tcp.h> + +#include "gtest/gtest.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/syscalls/linux/socket_inet_loopback_test_params.h" +#include "test/util/socket_util.h" +#include "test/util/test_util.h" + +// Unit tests in this file will run in their own network namespace. + +namespace gvisor { +namespace testing { + +namespace { + +using SocketInetLoopbackIsolatedTest = + ::testing::TestWithParam<SocketInetTestParam>; + +TEST_P(SocketInetLoopbackIsolatedTest, TCPActiveCloseTimeWaitTest) { + SocketInetTestParam const& param = GetParam(); + sockaddr_storage listen_addr, conn_bound_addr; + listen_addr = param.listener.addr; + SetupTimeWaitClose(¶m.listener, ¶m.connector, false /*reuse*/, + false /*accept_close*/, &listen_addr, &conn_bound_addr); + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), + param.connector.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); +} + +TEST_P(SocketInetLoopbackIsolatedTest, TCPActiveCloseTimeWaitReuseTest) { + SocketInetTestParam const& param = GetParam(); + sockaddr_storage listen_addr, conn_bound_addr; + listen_addr = param.listener.addr; + SetupTimeWaitClose(¶m.listener, ¶m.connector, true /*reuse*/, + false /*accept_close*/, &listen_addr, &conn_bound_addr); + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT(setsockopt(conn_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), + param.connector.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); +} + +// These tests are disabled under random save as the restore run +// results in the stack.Seed() being different which can cause +// sequence number of final connect to be one that is considered +// old and can cause the test to be flaky. +// +// Test re-binding of client and server bound addresses when the older +// connection is in TIME_WAIT. +TEST_P(SocketInetLoopbackIsolatedTest, TCPPassiveCloseNoTimeWaitTest) { + SocketInetTestParam const& param = GetParam(); + sockaddr_storage listen_addr, conn_bound_addr; + listen_addr = param.listener.addr; + SetupTimeWaitClose(¶m.listener, ¶m.connector, false /*reuse*/, + true /*accept_close*/, &listen_addr, &conn_bound_addr); + + // Now bind a new socket and verify that we can immediately rebind the address + // bound by the conn_fd as it never entered TIME_WAIT. + const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), + param.connector.addr_len), + SyscallSucceeds()); + + FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(param.listener.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT( + bind(listen_fd.get(), AsSockAddr(&listen_addr), param.listener.addr_len), + SyscallFailsWithErrno(EADDRINUSE)); +} + +TEST_P(SocketInetLoopbackIsolatedTest, TCPPassiveCloseNoTimeWaitReuseTest) { + SocketInetTestParam const& param = GetParam(); + sockaddr_storage listen_addr, conn_bound_addr; + listen_addr = param.listener.addr; + SetupTimeWaitClose(¶m.listener, ¶m.connector, true /*reuse*/, + true /*accept_close*/, &listen_addr, &conn_bound_addr); + + FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(param.listener.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT(setsockopt(listen_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + ASSERT_THAT( + bind(listen_fd.get(), AsSockAddr(&listen_addr), param.listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Now bind and connect new socket and verify that we can immediately rebind + // the address bound by the conn_fd as it never entered TIME_WAIT. + const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT(setsockopt(conn_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + ASSERT_THAT(bind(conn_fd.get(), AsSockAddr(&conn_bound_addr), + param.connector.addr_len), + SyscallSucceeds()); + + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(param.listener.family(), listen_addr)); + sockaddr_storage conn_addr = param.connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(param.connector.family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), + param.connector.addr_len), + SyscallSucceeds()); +} + +// TCPFinWait2Test creates a pair of connected sockets then closes one end to +// trigger FIN_WAIT2 state for the closed endpoint. Then it binds the same local +// IP/port on a new socket and tries to connect. The connect should fail w/ +// an EADDRINUSE. Then we wait till the FIN_WAIT2 timeout is over and try the +// connect again with a new socket and this time it should succeed. +// +// TCP timers are not S/R today, this can cause this test to be flaky when run +// under random S/R due to timer being reset on a restore. +TEST_P(SocketInetLoopbackIsolatedTest, TCPFinWait2Test) { + SocketInetTestParam const& param = GetParam(); + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT( + bind(listen_fd.get(), AsSockAddr(&listen_addr), listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), AsSockAddr(&listen_addr), &addrlen), + SyscallSucceeds()); + + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + + // Connect to the listening socket. + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + // Lower FIN_WAIT2 state to 5 seconds for test. + constexpr int kTCPLingerTimeout = 5; + EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2, + &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)), + SyscallSucceedsWithValue(0)); + + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + // Accept the connection. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + + // Get the address/port bound by the connecting socket. + sockaddr_storage conn_bound_addr; + socklen_t conn_addrlen = connector.addr_len; + ASSERT_THAT( + getsockname(conn_fd.get(), AsSockAddr(&conn_bound_addr), &conn_addrlen), + SyscallSucceeds()); + + // close the connecting FD to trigger FIN_WAIT2 on the connected fd. + conn_fd.reset(); + + // Now bind and connect a new socket. + const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + // Disable cooperative saves after this point. As a save between the first + // bind/connect and the second one can cause the linger timeout timer to + // be restarted causing the final bind/connect to fail. + DisableSave ds; + + ASSERT_THAT(bind(conn_fd2.get(), AsSockAddr(&conn_bound_addr), conn_addrlen), + SyscallFailsWithErrno(EADDRINUSE)); + + // Sleep for a little over the linger timeout to reduce flakiness in + // save/restore tests. + absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 2)); + + ds.reset(); + + ASSERT_THAT( + RetryEINTR(connect)(conn_fd2.get(), AsSockAddr(&conn_addr), conn_addrlen), + SyscallSucceeds()); +} + +// TCPLinger2TimeoutAfterClose creates a pair of connected sockets +// then closes one end to trigger FIN_WAIT2 state for the closed endpoint. +// It then sleeps for the TCP_LINGER2 timeout and verifies that bind/ +// connecting the same address succeeds. +// +// TCP timers are not S/R today, this can cause this test to be flaky when run +// under random S/R due to timer being reset on a restore. +TEST_P(SocketInetLoopbackIsolatedTest, TCPLinger2TimeoutAfterClose) { + SocketInetTestParam const& param = GetParam(); + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT( + bind(listen_fd.get(), AsSockAddr(&listen_addr), listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), AsSockAddr(&listen_addr), &addrlen), + SyscallSucceeds()); + + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + + // Connect to the listening socket. + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + // Accept the connection. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + + // Get the address/port bound by the connecting socket. + sockaddr_storage conn_bound_addr; + socklen_t conn_addrlen = connector.addr_len; + ASSERT_THAT( + getsockname(conn_fd.get(), AsSockAddr(&conn_bound_addr), &conn_addrlen), + SyscallSucceeds()); + + // Disable cooperative saves after this point as TCP timers are not restored + // across a S/R. + { + DisableSave ds; + constexpr int kTCPLingerTimeout = 5; + EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2, + &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)), + SyscallSucceedsWithValue(0)); + + // close the connecting FD to trigger FIN_WAIT2 on the connected fd. + conn_fd.reset(); + + absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1)); + + // ds going out of scope will Re-enable S/R's since at this point the timer + // must have fired and cleaned up the endpoint. + } + + // Now bind and connect a new socket and verify that we can immediately + // rebind the address bound by the conn_fd as it never entered TIME_WAIT. + const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + + ASSERT_THAT(bind(conn_fd2.get(), AsSockAddr(&conn_bound_addr), conn_addrlen), + SyscallSucceeds()); + ASSERT_THAT( + RetryEINTR(connect)(conn_fd2.get(), AsSockAddr(&conn_addr), conn_addrlen), + SyscallSucceeds()); +} + +INSTANTIATE_TEST_SUITE_P(All, SocketInetLoopbackIsolatedTest, + SocketInetLoopbackTestValues(), + DescribeSocketInetTestParam); + +using SocketMultiProtocolInetLoopbackIsolatedTest = + ::testing::TestWithParam<ProtocolTestParam>; + +TEST_P(SocketMultiProtocolInetLoopbackIsolatedTest, + V4EphemeralPortReservedReuseAddr) { + ProtocolTestParam const& param = GetParam(); + + // Bind the v4 loopback on a v4 socket. + TestAddress const& test_addr = V4Loopback(); + sockaddr_storage bound_addr = test_addr.addr; + const FileDescriptor bound_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + + ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + + ASSERT_THAT(bind(bound_fd.get(), AsSockAddr(&bound_addr), test_addr.addr_len), + SyscallSucceeds()); + + // Listen iff TCP. + if (param.type == SOCK_STREAM) { + ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); + } + + // Get the port that we bound. + socklen_t bound_addr_len = test_addr.addr_len; + ASSERT_THAT( + getsockname(bound_fd.get(), AsSockAddr(&bound_addr), &bound_addr_len), + SyscallSucceeds()); + + // Connect to bind an ephemeral port. + const FileDescriptor connected_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + + ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(), AsSockAddr(&bound_addr), + bound_addr_len), + SyscallSucceeds()); + + // Get the ephemeral port. + sockaddr_storage connected_addr = {}; + socklen_t connected_addr_len = sizeof(connected_addr); + ASSERT_THAT(getsockname(connected_fd.get(), AsSockAddr(&connected_addr), + &connected_addr_len), + SyscallSucceeds()); + uint16_t const ephemeral_port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); + + // Verify that we actually got an ephemeral port. + ASSERT_NE(ephemeral_port, 0); + + // Verify that the ephemeral port is not reserved. + const FileDescriptor checking_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + EXPECT_THAT( + bind(checking_fd.get(), AsSockAddr(&connected_addr), connected_addr_len), + SyscallSucceeds()); +} + +TEST_P(SocketMultiProtocolInetLoopbackIsolatedTest, + V4MappedEphemeralPortReservedReuseAddr) { + ProtocolTestParam const& param = GetParam(); + + // Bind the v4 loopback on a dual stack socket. + TestAddress const& test_addr = V4MappedLoopback(); + sockaddr_storage bound_addr = test_addr.addr; + const FileDescriptor bound_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(bind(bound_fd.get(), AsSockAddr(&bound_addr), test_addr.addr_len), + SyscallSucceeds()); + + ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + + // Listen iff TCP. + if (param.type == SOCK_STREAM) { + ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); + } + + // Get the port that we bound. + socklen_t bound_addr_len = test_addr.addr_len; + ASSERT_THAT( + getsockname(bound_fd.get(), AsSockAddr(&bound_addr), &bound_addr_len), + SyscallSucceeds()); + + // Connect to bind an ephemeral port. + const FileDescriptor connected_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(), AsSockAddr(&bound_addr), + bound_addr_len), + SyscallSucceeds()); + + // Get the ephemeral port. + sockaddr_storage connected_addr = {}; + socklen_t connected_addr_len = sizeof(connected_addr); + ASSERT_THAT(getsockname(connected_fd.get(), AsSockAddr(&connected_addr), + &connected_addr_len), + SyscallSucceeds()); + uint16_t const ephemeral_port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); + + // Verify that we actually got an ephemeral port. + ASSERT_NE(ephemeral_port, 0); + + // Verify that the ephemeral port is not reserved. + const FileDescriptor checking_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + EXPECT_THAT( + bind(checking_fd.get(), AsSockAddr(&connected_addr), connected_addr_len), + SyscallSucceeds()); +} + +TEST_P(SocketMultiProtocolInetLoopbackIsolatedTest, + V6EphemeralPortReservedReuseAddr) { + ProtocolTestParam const& param = GetParam(); + + // Bind the v6 loopback on a dual stack socket. + TestAddress const& test_addr = V6Loopback(); + sockaddr_storage bound_addr = test_addr.addr; + const FileDescriptor bound_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(bind(bound_fd.get(), AsSockAddr(&bound_addr), test_addr.addr_len), + SyscallSucceeds()); + ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + + // Listen iff TCP. + if (param.type == SOCK_STREAM) { + ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds()); + } + + // Get the port that we bound. + socklen_t bound_addr_len = test_addr.addr_len; + ASSERT_THAT( + getsockname(bound_fd.get(), AsSockAddr(&bound_addr), &bound_addr_len), + SyscallSucceeds()); + + // Connect to bind an ephemeral port. + const FileDescriptor connected_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(), AsSockAddr(&bound_addr), + bound_addr_len), + SyscallSucceeds()); + + // Get the ephemeral port. + sockaddr_storage connected_addr = {}; + socklen_t connected_addr_len = sizeof(connected_addr); + ASSERT_THAT(getsockname(connected_fd.get(), AsSockAddr(&connected_addr), + &connected_addr_len), + SyscallSucceeds()); + uint16_t const ephemeral_port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr)); + + // Verify that we actually got an ephemeral port. + ASSERT_NE(ephemeral_port, 0); + + // Verify that the ephemeral port is not reserved. + const FileDescriptor checking_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + EXPECT_THAT( + bind(checking_fd.get(), AsSockAddr(&connected_addr), connected_addr_len), + SyscallSucceeds()); +} + +INSTANTIATE_TEST_SUITE_P(AllFamilies, + SocketMultiProtocolInetLoopbackIsolatedTest, + ProtocolTestValues(), DescribeProtocolTestParam); + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc index 601ae107b..479162487 100644 --- a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc +++ b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc @@ -27,10 +27,11 @@ #include "gtest/gtest.h" #include "absl/strings/str_cat.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/socket_inet_loopback_test_params.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" #include "test/util/save_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -38,47 +39,7 @@ namespace testing { namespace { -using ::testing::Gt; - -PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) { - switch (family) { - case AF_INET: - return static_cast<uint16_t>( - reinterpret_cast<sockaddr_in const*>(&addr)->sin_port); - case AF_INET6: - return static_cast<uint16_t>( - reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port); - default: - return PosixError(EINVAL, - absl::StrCat("unknown socket family: ", family)); - } -} - -PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) { - switch (family) { - case AF_INET: - reinterpret_cast<sockaddr_in*>(addr)->sin_port = port; - return NoError(); - case AF_INET6: - reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port; - return NoError(); - default: - return PosixError(EINVAL, - absl::StrCat("unknown socket family: ", family)); - } -} - -struct TestParam { - TestAddress listener; - TestAddress connector; -}; - -std::string DescribeTestParam(::testing::TestParamInfo<TestParam> const& info) { - return absl::StrCat("Listen", info.param.listener.description, "_Connect", - info.param.connector.description); -} - -using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>; +using SocketInetLoopbackTest = ::testing::TestWithParam<SocketInetTestParam>; // This test verifies that connect returns EADDRNOTAVAIL if all local ephemeral // ports are already in use for a given destination ip/port. @@ -87,7 +48,7 @@ using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>; // // FIXME(b/162475855): This test is failing reliably. TEST_P(SocketInetLoopbackTest, DISABLED_TestTCPPortExhaustion) { - auto const& param = GetParam(); + SocketInetTestParam const& param = GetParam(); TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; @@ -136,59 +97,32 @@ TEST_P(SocketInetLoopbackTest, DISABLED_TestTCPPortExhaustion) { } } -INSTANTIATE_TEST_SUITE_P( - All, SocketInetLoopbackTest, - ::testing::Values( - // Listeners bound to IPv4 addresses refuse connections using IPv6 - // addresses. - TestParam{V4Any(), V4Any()}, TestParam{V4Any(), V4Loopback()}, - TestParam{V4Any(), V4MappedAny()}, - TestParam{V4Any(), V4MappedLoopback()}, - TestParam{V4Loopback(), V4Any()}, TestParam{V4Loopback(), V4Loopback()}, - TestParam{V4Loopback(), V4MappedLoopback()}, - TestParam{V4MappedAny(), V4Any()}, - TestParam{V4MappedAny(), V4Loopback()}, - TestParam{V4MappedAny(), V4MappedAny()}, - TestParam{V4MappedAny(), V4MappedLoopback()}, - TestParam{V4MappedLoopback(), V4Any()}, - TestParam{V4MappedLoopback(), V4Loopback()}, - TestParam{V4MappedLoopback(), V4MappedLoopback()}, - - // Listeners bound to IN6ADDR_ANY accept all connections. - TestParam{V6Any(), V4Any()}, TestParam{V6Any(), V4Loopback()}, - TestParam{V6Any(), V4MappedAny()}, - TestParam{V6Any(), V4MappedLoopback()}, TestParam{V6Any(), V6Any()}, - TestParam{V6Any(), V6Loopback()}, - - // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4 - // addresses. - TestParam{V6Loopback(), V6Any()}, - TestParam{V6Loopback(), V6Loopback()}), - DescribeTestParam); - -struct ProtocolTestParam { - std::string description; - int type; -}; - -std::string DescribeProtocolTestParam( - ::testing::TestParamInfo<ProtocolTestParam> const& info) { - return info.param.description; -} +INSTANTIATE_TEST_SUITE_P(All, SocketInetLoopbackTest, + SocketInetLoopbackTestValues(), + DescribeSocketInetTestParam); using SocketMultiProtocolInetLoopbackTest = ::testing::TestWithParam<ProtocolTestParam>; -TEST_P(SocketMultiProtocolInetLoopbackTest, BindAvoidsListeningPortsReuseAddr) { - const auto& param = GetParam(); - // UDP sockets are allowed to bind/listen on the port w/ SO_REUSEADDR, for TCP - // this is only permitted if there is no other listening socket. +TEST_P(SocketMultiProtocolInetLoopbackTest, + TCPBindAvoidsOtherBoundPortsReuseAddr) { + ProtocolTestParam const& param = GetParam(); + // UDP sockets are allowed to bind/listen on an already bound port w/ + // SO_REUSEADDR even when requesting a port from the kernel. In case of TCP + // rebinding is only permitted when SO_REUSEADDR is set and an explicit port + // is specified. When a zero port is specified to the bind() call then an + // already bound port will not be picked. SKIP_IF(param.type != SOCK_STREAM); DisableSave ds; // Too many syscalls. // A map of port to file descriptor binding the port. - std::map<uint16_t, FileDescriptor> listen_sockets; + std::map<uint16_t, FileDescriptor> bound_sockets; + + // Reduce number of ephemeral ports if permitted to reduce running time of + // the test. + [[maybe_unused]] const int nports = + ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); // Exhaust all ephemeral ports. while (true) { @@ -214,19 +148,63 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, BindAvoidsListeningPortsReuseAddr) { SyscallSucceeds()); uint16_t port = reinterpret_cast<sockaddr_in*>(&bound_addr)->sin_port; - // Newly bound port should not already be in use by a listening socket. - ASSERT_EQ(listen_sockets.find(port), listen_sockets.end()); - auto fd = bound_fd.get(); - listen_sockets.insert(std::make_pair(port, std::move(bound_fd))); - ASSERT_THAT(listen(fd, SOMAXCONN), SyscallSucceeds()); + auto [iter, inserted] = bound_sockets.emplace(port, std::move(bound_fd)); + ASSERT_TRUE(inserted); + } +} + +TEST_P(SocketMultiProtocolInetLoopbackTest, + UDPBindMayBindOtherBoundPortsReuseAddr) { + ProtocolTestParam const& param = GetParam(); + // UDP sockets are allowed to bind/listen on an already bound port w/ + // SO_REUSEADDR even when requesting a port from the kernel. + SKIP_IF(param.type != SOCK_DGRAM); + + DisableSave ds; // Too many syscalls. + + // A map of port to file descriptor binding the port. + std::map<uint16_t, FileDescriptor> bound_sockets; + + // Reduce number of ephemeral ports if permitted to reduce running time of + // the test. + [[maybe_unused]] const int nports = + ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); + + // Exhaust all ephemeral ports. + bool duplicate_binding = false; + while (true) { + // Bind the v4 loopback on a v4 socket. + TestAddress const& test_addr = V4Loopback(); + sockaddr_storage bound_addr = test_addr.addr; + FileDescriptor bound_fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + + ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + ASSERT_THAT( + bind(bound_fd.get(), AsSockAddr(&bound_addr), test_addr.addr_len), + SyscallSucceeds()); + + // Get the port that we bound. + socklen_t bound_addr_len = test_addr.addr_len; + ASSERT_THAT( + getsockname(bound_fd.get(), AsSockAddr(&bound_addr), &bound_addr_len), + SyscallSucceeds()); + uint16_t port = reinterpret_cast<sockaddr_in*>(&bound_addr)->sin_port; + + auto [iter, inserted] = bound_sockets.emplace(port, std::move(bound_fd)); + if (!inserted) { + duplicate_binding = true; + break; + } } + ASSERT_TRUE(duplicate_binding); } -INSTANTIATE_TEST_SUITE_P( - AllFamilies, SocketMultiProtocolInetLoopbackTest, - ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM}, - ProtocolTestParam{"UDP", SOCK_DGRAM}), - DescribeProtocolTestParam); +INSTANTIATE_TEST_SUITE_P(AllFamilies, SocketMultiProtocolInetLoopbackTest, + ProtocolTestValues(), DescribeProtocolTestParam); } // namespace diff --git a/test/syscalls/linux/socket_inet_loopback_test_params.h b/test/syscalls/linux/socket_inet_loopback_test_params.h new file mode 100644 index 000000000..163e595a8 --- /dev/null +++ b/test/syscalls/linux/socket_inet_loopback_test_params.h @@ -0,0 +1,86 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_INET_LOOPBACK_TEST_PARAMS_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_INET_LOOPBACK_TEST_PARAMS_H_ + +#include "gtest/gtest.h" +#include "test/util/socket_util.h" + +namespace gvisor { +namespace testing { + +struct SocketInetTestParam { + TestAddress listener; + TestAddress connector; +}; + +inline std::string DescribeSocketInetTestParam( + ::testing::TestParamInfo<SocketInetTestParam> const& info) { + return absl::StrCat("Listen", info.param.listener.description, "_Connect", + info.param.connector.description); +} + +inline auto SocketInetLoopbackTestValues() { + return ::testing::Values( + // Listeners bound to IPv4 addresses refuse connections using IPv6 + // addresses. + SocketInetTestParam{V4Any(), V4Any()}, + SocketInetTestParam{V4Any(), V4Loopback()}, + SocketInetTestParam{V4Any(), V4MappedAny()}, + SocketInetTestParam{V4Any(), V4MappedLoopback()}, + SocketInetTestParam{V4Loopback(), V4Any()}, + SocketInetTestParam{V4Loopback(), V4Loopback()}, + SocketInetTestParam{V4Loopback(), V4MappedLoopback()}, + SocketInetTestParam{V4MappedAny(), V4Any()}, + SocketInetTestParam{V4MappedAny(), V4Loopback()}, + SocketInetTestParam{V4MappedAny(), V4MappedAny()}, + SocketInetTestParam{V4MappedAny(), V4MappedLoopback()}, + SocketInetTestParam{V4MappedLoopback(), V4Any()}, + SocketInetTestParam{V4MappedLoopback(), V4Loopback()}, + SocketInetTestParam{V4MappedLoopback(), V4MappedLoopback()}, + + // Listeners bound to IN6ADDR_ANY accept all connections. + SocketInetTestParam{V6Any(), V4Any()}, + SocketInetTestParam{V6Any(), V4Loopback()}, + SocketInetTestParam{V6Any(), V4MappedAny()}, + SocketInetTestParam{V6Any(), V4MappedLoopback()}, + SocketInetTestParam{V6Any(), V6Any()}, + SocketInetTestParam{V6Any(), V6Loopback()}, + + // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4 + // addresses. + SocketInetTestParam{V6Loopback(), V6Any()}, + SocketInetTestParam{V6Loopback(), V6Loopback()}); +} + +struct ProtocolTestParam { + std::string description; + int type; +}; + +inline std::string DescribeProtocolTestParam( + ::testing::TestParamInfo<ProtocolTestParam> const& info) { + return info.param.description; +} + +inline auto ProtocolTestValues() { + return ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM}, + ProtocolTestParam{"UDP", SOCK_DGRAM}); +} + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_INET_LOOPBACK_TEST_PARAMS_H_ diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc index fda252dd7..caa5c0c63 100644 --- a/test/syscalls/linux/socket_ip_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc @@ -18,7 +18,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 2f5743cda..3271263c8 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -28,7 +28,7 @@ #include "absl/memory/memory.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/socket_ip_tcp_generic.h b/test/syscalls/linux/socket_ip_tcp_generic.h index a3eff3c73..e9e60ef4c 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.h +++ b/test/syscalls/linux/socket_ip_tcp_generic.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc index 4e79d21f4..3406874b8 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc @@ -18,7 +18,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_ip_tcp_generic.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc index 9db3037bc..0796b8634 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc @@ -16,7 +16,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_generic.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc index f996b93d2..533ccc3ae 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc @@ -18,7 +18,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_stream_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc index ffa377210..05fe2a738 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc @@ -18,7 +18,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_non_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc index f178f1af9..88adb5b1b 100644 --- a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc @@ -23,7 +23,7 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index 1694e188a..8a87f2667 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -28,7 +28,7 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_udp_generic.h b/test/syscalls/linux/socket_ip_udp_generic.h index 106c54e9f..a3a66c768 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.h +++ b/test/syscalls/linux/socket_ip_udp_generic.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc index c7fa44884..6d06bd580 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback.cc @@ -18,7 +18,7 @@ #include "test/syscalls/linux/socket_generic.h" #include "test/syscalls/linux/socket_ip_udp_generic.h" #include "test/syscalls/linux/socket_non_stream.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc index d6925a8df..60d02e079 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc @@ -16,7 +16,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_non_stream_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc index d675eddc6..c011e3658 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc @@ -16,7 +16,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_non_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ip_udp_unbound_external_networking.cc index fdbb2216b..af2459a2f 100644 --- a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ip_udp_unbound_external_networking.cc @@ -14,7 +14,7 @@ #include "test/syscalls/linux/socket_ip_udp_unbound_external_networking.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h index e5287addb..2e8aab129 100644 --- a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h +++ b/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h @@ -16,7 +16,7 @@ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_EXTERNAL_NETWORKING_H_ #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc index 029f1e872..930f19e59 100644 --- a/test/syscalls/linux/socket_ip_unbound.cc +++ b/test/syscalls/linux/socket_ip_unbound.cc @@ -24,7 +24,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ip_unbound_netlink.cc b/test/syscalls/linux/socket_ip_unbound_netlink.cc index b02222999..803a3b30b 100644 --- a/test/syscalls/linux/socket_ip_unbound_netlink.cc +++ b/test/syscalls/linux/socket_ip_unbound_netlink.cc @@ -25,8 +25,8 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_netlink_route_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.cc b/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.cc new file mode 100644 index 000000000..0e3e7057b --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.cc @@ -0,0 +1,299 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.h" + +#include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/util/capability_util.h" + +namespace gvisor { +namespace testing { + +void IPv4DatagramBasedUnboundSocketTest::SetUp() { + if (GetParam().type & SOCK_RAW) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); + } +} + +// Check that dropping a group membership that does not exist fails. +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastInvalidDrop) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Unregister from a membership that we didn't have. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group, + sizeof(group)), + SyscallFailsWithErrno(EADDRNOTAVAIL)); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfZero) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreqn iface = {}; + EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, + sizeof(iface)), + SyscallSucceeds()); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfInvalidNic) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreqn iface = {}; + iface.imr_ifindex = -1; + EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, + sizeof(iface)), + SyscallFailsWithErrno(EADDRNOTAVAIL)); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfInvalidAddr) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreq iface = {}; + iface.imr_interface.s_addr = inet_addr("255.255.255"); + EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, + sizeof(iface)), + SyscallFailsWithErrno(EADDRNOTAVAIL)); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfSetShort) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Create a valid full-sized request. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); + + // Send an optlen of 1 to check that optlen is enforced. + EXPECT_THAT( + setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfDefault) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, 0); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfDefaultReqn) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the + // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. + // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. + EXPECT_EQ(size, sizeof(in_addr)); + + // getsockopt(IP_MULTICAST_IF) will only return the interface address which + // hasn't been set. + EXPECT_EQ(get.imr_multiaddr.s_addr, 0); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfSetAddrGetReqn) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + in_addr set = {}; + set.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the + // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. + // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. + EXPECT_EQ(size, sizeof(in_addr)); + EXPECT_EQ(get.imr_multiaddr.s_addr, set.s_addr); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfSetReqAddrGetReqn) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreq set = {}; + set.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the + // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. + // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. + EXPECT_EQ(size, sizeof(in_addr)); + EXPECT_EQ(get.imr_multiaddr.s_addr, set.imr_interface.s_addr); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfSetNicGetReqn) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreqn set = {}; + set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); + ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + EXPECT_EQ(size, sizeof(in_addr)); + EXPECT_EQ(get.imr_multiaddr.s_addr, 0); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfSetAddr) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + in_addr set = {}; + set.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, set.s_addr); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfSetReqAddr) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreq set = {}; + set.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, set.imr_interface.s_addr); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, IpMulticastIfSetNic) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreqn set = {}; + set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); + ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, 0); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, TestJoinGroupNoIf) { + // TODO(b/185517803): Fix for native test. + SKIP_IF(!IsRunningOnGvisor()); + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallFailsWithErrno(ENODEV)); +} + +TEST_P(IPv4DatagramBasedUnboundSocketTest, TestJoinGroupInvalidIf) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + ip_mreqn group = {}; + group.imr_address.s_addr = inet_addr("255.255.255"); + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallFailsWithErrno(ENODEV)); +} + +// Check that multiple memberships are not allowed on the same socket. +TEST_P(IPv4DatagramBasedUnboundSocketTest, TestMultipleJoinsOnSingleSocket) { + auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto fd = socket1->get(); + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); + + EXPECT_THAT( + setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), + SyscallSucceeds()); + + EXPECT_THAT( + setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), + SyscallFailsWithErrno(EADDRINUSE)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.h b/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.h new file mode 100644 index 000000000..7ab235cad --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.h @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_DATAGRAM_BASED_SOCKET_UNBOUND_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_DATAGRAM_BASED_SOCKET_UNBOUND_H_ + +#include "test/util/socket_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to IPv4 datagram-based sockets. +class IPv4DatagramBasedUnboundSocketTest : public SimpleSocketTest { + void SetUp() override; +}; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_DATAGRAM_BASED_SOCKET_UNBOUND_H_ diff --git a/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound_loopback.cc new file mode 100644 index 000000000..9ae68e015 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound_loopback.cc @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_ipv4_datagram_based_socket_unbound.h" + +namespace gvisor { +namespace testing { + +INSTANTIATE_TEST_SUITE_P(IPv4Sockets, IPv4DatagramBasedUnboundSocketTest, + ::testing::ValuesIn(ApplyVecToVec<SocketKind>( + {IPv4UDPUnboundSocket, IPv4RawUDPUnboundSocket}, + AllBitwiseCombinations(List<int>{ + 0, SOCK_NONBLOCK})))); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index 18be4dcc7..05773a29c 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -15,8 +15,6 @@ #include "test/syscalls/linux/socket_ipv4_udp_unbound.h" #include <arpa/inet.h> -#include <net/if.h> -#include <sys/ioctl.h> #include <sys/socket.h> #include <sys/types.h> #include <sys/un.h> @@ -26,10 +24,6 @@ #include "gtest/gtest.h" #include "absl/memory/memory.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" -#include "test/util/posix_error.h" -#include "test/util/save_util.h" -#include "test/util/test_util.h" namespace gvisor { namespace testing { @@ -140,7 +134,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNicNoDefaultSendIf) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -238,7 +232,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNic) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -326,7 +320,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNic) { // Set the default send interface. ip_mreqn iface = {}; - iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); @@ -346,7 +340,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNic) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -437,7 +431,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicConnect) { // Set the default send interface. ip_mreqn iface = {}; - iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); @@ -457,7 +451,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicConnect) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -548,7 +542,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelf) { // Set the default send interface. ip_mreqn iface = {}; - iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); @@ -568,7 +562,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelf) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -657,7 +651,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfConnect) { // Set the default send interface. ip_mreqn iface = {}; - iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); @@ -677,7 +671,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfConnect) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -770,7 +764,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfNoLoop) { // Set the default send interface. ip_mreqn iface = {}; - iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); @@ -794,7 +788,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfNoLoop) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -819,20 +813,6 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfNoLoop) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that dropping a group membership that does not exist fails. -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastInvalidDrop) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - // Unregister from a membership that we didn't have. - ip_mreq group = {}; - group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group, - sizeof(group)), - SyscallFailsWithErrno(EADDRNOTAVAIL)); -} - // Check that dropping a group membership prevents multicast packets from being // delivered. Default send address configured by bind and group membership // interface configured by address. @@ -917,7 +897,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropNic) { // Register and unregister to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -943,260 +923,6 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropNic) { PosixErrorIs(EAGAIN, ::testing::_)); } -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfZero) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreqn iface = {}; - EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, - sizeof(iface)), - SyscallSucceeds()); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidNic) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreqn iface = {}; - iface.imr_ifindex = -1; - EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, - sizeof(iface)), - SyscallFailsWithErrno(EADDRNOTAVAIL)); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidAddr) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreq iface = {}; - iface.imr_interface.s_addr = inet_addr("255.255.255"); - EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, - sizeof(iface)), - SyscallFailsWithErrno(EADDRNOTAVAIL)); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetShort) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - // Create a valid full-sized request. - ip_mreqn iface = {}; - iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - - // Send an optlen of 1 to check that optlen is enforced. - EXPECT_THAT( - setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1), - SyscallFailsWithErrno(EINVAL)); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefault) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - in_addr get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - EXPECT_EQ(size, sizeof(get)); - EXPECT_EQ(get.s_addr, 0); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefaultReqn) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreqn get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - - // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the - // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. - // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. - EXPECT_EQ(size, sizeof(in_addr)); - - // getsockopt(IP_MULTICAST_IF) will only return the interface address which - // hasn't been set. - EXPECT_EQ(get.imr_multiaddr.s_addr, 0); - EXPECT_EQ(get.imr_address.s_addr, 0); - EXPECT_EQ(get.imr_ifindex, 0); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddrGetReqn) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - in_addr set = {}; - set.s_addr = htonl(INADDR_LOOPBACK); - ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, - sizeof(set)), - SyscallSucceeds()); - - ip_mreqn get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - - // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the - // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. - // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. - EXPECT_EQ(size, sizeof(in_addr)); - EXPECT_EQ(get.imr_multiaddr.s_addr, set.s_addr); - EXPECT_EQ(get.imr_address.s_addr, 0); - EXPECT_EQ(get.imr_ifindex, 0); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddrGetReqn) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreq set = {}; - set.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, - sizeof(set)), - SyscallSucceeds()); - - ip_mreqn get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - - // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the - // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. - // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. - EXPECT_EQ(size, sizeof(in_addr)); - EXPECT_EQ(get.imr_multiaddr.s_addr, set.imr_interface.s_addr); - EXPECT_EQ(get.imr_address.s_addr, 0); - EXPECT_EQ(get.imr_ifindex, 0); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNicGetReqn) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreqn set = {}; - set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, - sizeof(set)), - SyscallSucceeds()); - - ip_mreqn get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - EXPECT_EQ(size, sizeof(in_addr)); - EXPECT_EQ(get.imr_multiaddr.s_addr, 0); - EXPECT_EQ(get.imr_address.s_addr, 0); - EXPECT_EQ(get.imr_ifindex, 0); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddr) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - in_addr set = {}; - set.s_addr = htonl(INADDR_LOOPBACK); - ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, - sizeof(set)), - SyscallSucceeds()); - - in_addr get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - - EXPECT_EQ(size, sizeof(get)); - EXPECT_EQ(get.s_addr, set.s_addr); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddr) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreq set = {}; - set.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, - sizeof(set)), - SyscallSucceeds()); - - in_addr get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - - EXPECT_EQ(size, sizeof(get)); - EXPECT_EQ(get.s_addr, set.imr_interface.s_addr); -} - -TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNic) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreqn set = {}; - set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set, - sizeof(set)), - SyscallSucceeds()); - - in_addr get = {}; - socklen_t size = sizeof(get); - ASSERT_THAT( - getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), - SyscallSucceeds()); - EXPECT_EQ(size, sizeof(get)); - EXPECT_EQ(get.s_addr, 0); -} - -TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupNoIf) { - // TODO(b/185517803): Fix for native test. - SKIP_IF(!IsRunningOnGvisor()); - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreqn group = {}; - group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, - sizeof(group)), - SyscallFailsWithErrno(ENODEV)); -} - -TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupInvalidIf) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - - ip_mreqn group = {}; - group.imr_address.s_addr = inet_addr("255.255.255"); - group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, - sizeof(group)), - SyscallFailsWithErrno(ENODEV)); -} - -// Check that multiple memberships are not allowed on the same socket. -TEST_P(IPv4UDPUnboundSocketTest, TestMultipleJoinsOnSingleSocket) { - auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - auto fd = socket1->get(); - ip_mreqn group = {}; - group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - - EXPECT_THAT( - setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), - SyscallSucceeds()); - - EXPECT_THAT( - setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), - SyscallFailsWithErrno(EADDRINUSE)); -} - // Check that two sockets can join the same multicast group at the same time. TEST_P(IPv4UDPUnboundSocketTest, TestTwoSocketsJoinSameMulticastGroup) { auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -1204,7 +930,7 @@ TEST_P(IPv4UDPUnboundSocketTest, TestTwoSocketsJoinSameMulticastGroup) { ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -1419,7 +1145,7 @@ TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenJoinThenReceive) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -2110,16 +1836,11 @@ TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) { EXPECT_EQ(cmsg->cmsg_level, level); EXPECT_EQ(cmsg->cmsg_type, type); - // Get loopback index. - ifreq ifr = {}; - absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo"); - ASSERT_THAT(ioctl(sender->get(), SIOCGIFINDEX, &ifr), SyscallSucceeds()); - ASSERT_NE(ifr.ifr_ifindex, 0); - // Check the data in_pktinfo received_pktinfo = {}; memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo)); - EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex); + EXPECT_EQ(received_pktinfo.ipi_ifindex, + ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex())); EXPECT_EQ(received_pktinfo.ipi_spec_dst.s_addr, htonl(INADDR_LOOPBACK)); EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK)); } @@ -2439,7 +2160,7 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIPPacketInfo) { // Register to receive multicast packets. ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex()); ASSERT_THAT(setsockopt(receiver_socket->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -2484,16 +2205,10 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIPPacketInfo) { EXPECT_EQ(cmsg->cmsg_level, IPPROTO_IP); EXPECT_EQ(cmsg->cmsg_type, IP_PKTINFO); - // Get loopback index. - ifreq ifr = {}; - absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo"); - ASSERT_THAT(ioctl(receiver_socket->get(), SIOCGIFINDEX, &ifr), - SyscallSucceeds()); - ASSERT_NE(ifr.ifr_ifindex, 0); - in_pktinfo received_pktinfo = {}; memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo)); - EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex); + EXPECT_EQ(received_pktinfo.ipi_ifindex, + ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex())); if (IsRunningOnGvisor()) { // This should actually be a unicast address assigned to the interface. // diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.h b/test/syscalls/linux/socket_ipv4_udp_unbound.h index f64c57645..3818a3490 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.h +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc index f6e64c157..ebf2185f2 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc @@ -17,7 +17,7 @@ #include <vector> #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc index f121c044d..bef284530 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc @@ -12,12 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <vector> - #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_ipv4_udp_unbound.h" -#include "test/syscalls/linux/socket_test_util.h" -#include "test/util/test_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc index 8052bf404..f90a48630 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc @@ -16,7 +16,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_nogotsan.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_nogotsan.cc index 7ca6d52e4..5a7bca658 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_nogotsan.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_nogotsan.cc @@ -18,7 +18,7 @@ #include "gtest/gtest.h" #include "absl/memory/memory.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -31,7 +31,7 @@ using IPv4UDPUnboundSocketNogotsanTest = SimpleSocketTest; // We disable S/R because this test creates a large number of sockets. TEST_P(IPv4UDPUnboundSocketNogotsanTest, UDPConnectPortExhaustion) { auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - constexpr int kClients = 65536; + const int kClients = ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); // Bind the first socket to the loopback and take note of the selected port. auto addr = V4Loopback(); ASSERT_THAT(bind(receiver1->get(), AsSockAddr(&addr.addr), addr.addr_len), @@ -61,7 +61,7 @@ TEST_P(IPv4UDPUnboundSocketNogotsanTest, UDPConnectPortExhaustion) { // We disable S/R because this test creates a large number of sockets. TEST_P(IPv4UDPUnboundSocketNogotsanTest, UDPBindPortExhaustion) { auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - constexpr int kClients = 65536; + const int kClients = ASSERT_NO_ERRNO_AND_VALUE(MaybeLimitEphemeralPorts()); auto addr = V4Loopback(); // Disable cooperative S/R as we are making too many syscalls. DisableSave ds; diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h index 73e7836d5..17c8e2b84 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound.cc b/test/syscalls/linux/socket_ipv6_udp_unbound.cc index a4e3371f4..612fd531c 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv6_udp_unbound.cc @@ -31,9 +31,9 @@ #include "gtest/gtest.h" #include "absl/memory/memory.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/posix_error.h" #include "test/util/save_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound.h b/test/syscalls/linux/socket_ipv6_udp_unbound.h index 71e160f73..060343eaf 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound.h +++ b/test/syscalls/linux/socket_ipv6_udp_unbound.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc index 09f070797..d72b6b53f 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc @@ -39,7 +39,7 @@ TEST_P(IPv6UDPUnboundExternalNetworkingSocketTest, TestJoinLeaveMulticast) { .ipv6mr_multiaddr = reinterpret_cast<sockaddr_in6*>(&multicast_addr.addr)->sin6_addr, .ipv6mr_interface = static_cast<decltype(ipv6_mreq::ipv6mr_interface)>( - ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"))), + ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex())), }; ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IPV6, IPV6_ADD_MEMBERSHIP, &group_req, sizeof(group_req)), diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking_test.cc index 5c764b8fd..ff12eafc3 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking_test.cc @@ -17,7 +17,7 @@ #include <vector> #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback.cc index 058336ecc..f11f444a2 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound_loopback.cc +++ b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback.cc @@ -16,7 +16,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_ipv6_udp_unbound.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc index 17021ff82..565f9bc2e 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc +++ b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc @@ -16,7 +16,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h index 88098be82..f017a4c89 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h +++ b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc index 5f8d7f981..c95c5305d 100644 --- a/test/syscalls/linux/socket_netdevice.cc +++ b/test/syscalls/linux/socket_netdevice.cc @@ -22,8 +22,8 @@ #include "gtest/gtest.h" #include "absl/base/internal/endian.h" #include "test/syscalls/linux/socket_netlink_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // Tests for netdevice queries. @@ -37,6 +37,7 @@ using ::testing::AnyOf; using ::testing::Eq; TEST(NetdeviceTest, Loopback) { + SKIP_IF(IsRunningWithHostinet()); FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); @@ -60,6 +61,7 @@ TEST(NetdeviceTest, Loopback) { } TEST(NetdeviceTest, Netmask) { + SKIP_IF(IsRunningWithHostinet()); // We need an interface index to identify the loopback device. FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); @@ -135,6 +137,7 @@ TEST(NetdeviceTest, Netmask) { } TEST(NetdeviceTest, InterfaceName) { + SKIP_IF(IsRunningWithHostinet()); FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); @@ -168,6 +171,7 @@ TEST(NetdeviceTest, InterfaceFlags) { } TEST(NetdeviceTest, InterfaceMTU) { + SKIP_IF(IsRunningWithHostinet()); FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); @@ -181,6 +185,7 @@ TEST(NetdeviceTest, InterfaceMTU) { } TEST(NetdeviceTest, EthtoolGetTSInfo) { + SKIP_IF(IsRunningWithHostinet()); FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); diff --git a/test/syscalls/linux/socket_netlink.cc b/test/syscalls/linux/socket_netlink.cc index 4ec0fd4fa..c78529a14 100644 --- a/test/syscalls/linux/socket_netlink.cc +++ b/test/syscalls/linux/socket_netlink.cc @@ -18,8 +18,8 @@ #include <unistd.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // Tests for all netlink socket protocols. diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index ee3c08770..d5e1ce0cc 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -29,10 +29,10 @@ #include "absl/strings/str_format.h" #include "test/syscalls/linux/socket_netlink_route_util.h" #include "test/syscalls/linux/socket_netlink_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/cleanup.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // Tests for NETLINK_ROUTE sockets. @@ -44,6 +44,7 @@ namespace { constexpr uint32_t kSeq = 12345; +using ::testing::_; using ::testing::AnyOf; using ::testing::Eq; @@ -244,7 +245,7 @@ TEST(NetlinkRouteTest, GetLinkByIndexNotFound) { req.ifm.ifi_index = 1234590; EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), - PosixErrorIs(ENODEV, ::testing::_)); + PosixErrorIs(ENODEV, _)); } TEST(NetlinkRouteTest, GetLinkByNameNotFound) { @@ -273,7 +274,112 @@ TEST(NetlinkRouteTest, GetLinkByNameNotFound) { NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len); EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), - PosixErrorIs(ENODEV, ::testing::_)); + PosixErrorIs(ENODEV, _)); +} + +TEST(NetlinkRouteTest, RemoveLoopbackByName) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink()); + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + struct rtattr rtattr; + char ifname[IFNAMSIZ]; + char pad[NLMSG_ALIGNTO + RTA_ALIGNTO]; + }; + + struct request req = {}; + req.hdr.nlmsg_type = RTM_DELLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + req.rtattr.rta_type = IFLA_IFNAME; + req.rtattr.rta_len = RTA_LENGTH(loopback_link.name.size() + 1); + strncpy(req.ifname, loopback_link.name.c_str(), sizeof(req.ifname)); + req.hdr.nlmsg_len = + NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len); + + EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), + PosixErrorIs(ENOTSUP, _)); +} + +TEST(NetlinkRouteTest, RemoveLoopbackByIndex) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; + + struct request req = {}; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_DELLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + req.ifm.ifi_index = loopback_link.index; + + EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), + PosixErrorIs(ENOTSUP, _)); +} + +TEST(NetlinkRouteTest, RemoveLinkByIndexNotFound) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; + + struct request req = {}; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + req.ifm.ifi_index = 1234590; + + EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), + PosixErrorIs(ENODEV, _)); +} + +TEST(NetlinkRouteTest, RemoveLinkByNameNotFound) { + const std::string name = "nodevice?!"; + + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE)); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + struct rtattr rtattr; + char ifname[IFNAMSIZ]; + char pad[NLMSG_ALIGNTO + RTA_ALIGNTO]; + }; + + struct request req = {}; + req.hdr.nlmsg_type = RTM_DELLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + req.rtattr.rta_type = IFLA_IFNAME; + req.rtattr.rta_len = RTA_LENGTH(name.size() + 1); + strncpy(req.ifname, name.c_str(), sizeof(req.ifname)); + req.hdr.nlmsg_len = + NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len); + + EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), + PosixErrorIs(ENODEV, _)); } TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) { @@ -295,7 +401,7 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) { req.ifm.ifi_family = AF_UNSPEC; EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)), - PosixErrorIs(EOPNOTSUPP, ::testing::_)); + PosixErrorIs(EOPNOTSUPP, _)); } TEST(NetlinkRouteTest, MsgHdrMsgTrunc) { @@ -536,7 +642,7 @@ TEST(NetlinkRouteTest, AddAndRemoveAddr) { // Second delete should fail, as address no longer exists. EXPECT_THAT(LinkDelLocalAddr(loopback_link.index, AF_INET, /*prefixlen=*/24, &addr, sizeof(addr)), - PosixErrorIs(EADDRNOTAVAIL, ::testing::_)); + PosixErrorIs(EADDRNOTAVAIL, _)); }); // Replace an existing address should succeed. @@ -546,7 +652,7 @@ TEST(NetlinkRouteTest, AddAndRemoveAddr) { // Create exclusive should fail, as we created the address above. EXPECT_THAT(LinkAddExclusiveLocalAddr(loopback_link.index, AF_INET, /*prefixlen=*/24, &addr, sizeof(addr)), - PosixErrorIs(EEXIST, ::testing::_)); + PosixErrorIs(EEXIST, _)); } // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request. diff --git a/test/syscalls/linux/socket_netlink_uevent.cc b/test/syscalls/linux/socket_netlink_uevent.cc index da425bed4..9e025911b 100644 --- a/test/syscalls/linux/socket_netlink_uevent.cc +++ b/test/syscalls/linux/socket_netlink_uevent.cc @@ -20,8 +20,8 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/socket_netlink_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" // Tests for NETLINK_KOBJECT_UEVENT sockets. diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc index bdebea321..c1bff3c65 100644 --- a/test/syscalls/linux/socket_netlink_util.cc +++ b/test/syscalls/linux/socket_netlink_util.cc @@ -22,7 +22,7 @@ #include <vector> #include "absl/strings/str_cat.h" -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_non_blocking.cc b/test/syscalls/linux/socket_non_blocking.cc index c3520cadd..3d09485d3 100644 --- a/test/syscalls/linux/socket_non_blocking.cc +++ b/test/syscalls/linux/socket_non_blocking.cc @@ -20,8 +20,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_non_blocking.h b/test/syscalls/linux/socket_non_blocking.h index bd3e02fd2..604206cfb 100644 --- a/test/syscalls/linux/socket_non_blocking.h +++ b/test/syscalls/linux/socket_non_blocking.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc index c61817f14..7c3310909 100644 --- a/test/syscalls/linux/socket_non_stream.cc +++ b/test/syscalls/linux/socket_non_stream.cc @@ -20,8 +20,8 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_non_stream.h b/test/syscalls/linux/socket_non_stream.h index 469fbe6a2..4876730f9 100644 --- a/test/syscalls/linux/socket_non_stream.h +++ b/test/syscalls/linux/socket_non_stream.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc index b052f6e61..ac33407f8 100644 --- a/test/syscalls/linux/socket_non_stream_blocking.cc +++ b/test/syscalls/linux/socket_non_stream_blocking.cc @@ -22,8 +22,8 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/socket_non_stream_blocking.h b/test/syscalls/linux/socket_non_stream_blocking.h index 6e205a039..71520bb37 100644 --- a/test/syscalls/linux/socket_non_stream_blocking.h +++ b/test/syscalls/linux/socket_non_stream_blocking.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc index 6522b2e01..11903f28b 100644 --- a/test/syscalls/linux/socket_stream.cc +++ b/test/syscalls/linux/socket_stream.cc @@ -21,8 +21,8 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_stream.h b/test/syscalls/linux/socket_stream.h index b837b8f8c..dc6fb2f98 100644 --- a/test/syscalls/linux/socket_stream.h +++ b/test/syscalls/linux/socket_stream.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc index 0743322ac..e168f79ff 100644 --- a/test/syscalls/linux/socket_stream_blocking.cc +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -22,8 +22,8 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" #include "test/util/timer_util.h" diff --git a/test/syscalls/linux/socket_stream_blocking.h b/test/syscalls/linux/socket_stream_blocking.h index 9fd19ff90..f760188f6 100644 --- a/test/syscalls/linux/socket_stream_blocking.h +++ b/test/syscalls/linux/socket_stream_blocking.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_stream_nonblock.cc b/test/syscalls/linux/socket_stream_nonblock.cc index 74d608741..788fae906 100644 --- a/test/syscalls/linux/socket_stream_nonblock.cc +++ b/test/syscalls/linux/socket_stream_nonblock.cc @@ -20,8 +20,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_stream_nonblock.h b/test/syscalls/linux/socket_stream_nonblock.h index c3b7fad91..d1adaa95c 100644 --- a/test/syscalls/linux/socket_stream_nonblock.h +++ b/test/syscalls/linux/socket_stream_nonblock.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index 591cab3fd..43433eaae 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -26,8 +26,11 @@ #include "gtest/gtest.h" #include "absl/strings/string_view.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/memory_util.h" +#include "test/util/socket_util.h" +#include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -268,6 +271,18 @@ TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) { } } +// Repro for b/196804997. +TEST_P(UnixSocketPairTest, SendFromMmapBeyondEof) { + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, fd.get(), 0)); + + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + ASSERT_THAT(send(sockets->first_fd(), m.ptr(), m.len(), 0), + SyscallFailsWithErrno(EFAULT)); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/socket_unix.h b/test/syscalls/linux/socket_unix.h index 3625cc404..f6405f399 100644 --- a/test/syscalls/linux/socket_unix.h +++ b/test/syscalls/linux/socket_unix.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc index 8bef76b67..617c5bfe5 100644 --- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc +++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_non_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc index 77cb8c6d6..ba34320bc 100644 --- a/test/syscalls/linux/socket_unix_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_blocking_local.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc index 22a4ee0d1..6191b1448 100644 --- a/test/syscalls/linux/socket_unix_cmsg.cc +++ b/test/syscalls/linux/socket_unix_cmsg.cc @@ -26,8 +26,8 @@ #include "gtest/gtest.h" #include "absl/strings/string_view.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" diff --git a/test/syscalls/linux/socket_unix_cmsg.h b/test/syscalls/linux/socket_unix_cmsg.h index 431606903..f5a276155 100644 --- a/test/syscalls/linux/socket_unix_cmsg.h +++ b/test/syscalls/linux/socket_unix_cmsg.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc index 5b0844493..d43c83d71 100644 --- a/test/syscalls/linux/socket_unix_dgram.cc +++ b/test/syscalls/linux/socket_unix_dgram.cc @@ -20,8 +20,8 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_dgram.h b/test/syscalls/linux/socket_unix_dgram.h index 0764ef85b..e9b8373a5 100644 --- a/test/syscalls/linux/socket_unix_dgram.h +++ b/test/syscalls/linux/socket_unix_dgram.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc index 31d2d5216..4760630b9 100644 --- a/test/syscalls/linux/socket_unix_dgram_local.cc +++ b/test/syscalls/linux/socket_unix_dgram_local.cc @@ -15,10 +15,10 @@ #include <vector> #include "test/syscalls/linux/socket_non_stream.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/socket_unix_dgram.h" #include "test/syscalls/linux/socket_unix_non_stream.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc index 2db8b68d3..ca277122e 100644 --- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc +++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc @@ -16,8 +16,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc index f7dff8b4d..d8cb5b892 100644 --- a/test/syscalls/linux/socket_unix_domain.cc +++ b/test/syscalls/linux/socket_unix_domain.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_generic.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc index 6700b4d90..cdd38f681 100644 --- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc +++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_non_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index 9425e87a6..d18f9e7b0 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -19,9 +19,9 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/memory_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_non_stream.h b/test/syscalls/linux/socket_unix_non_stream.h index 7478ab172..44d1c0033 100644 --- a/test/syscalls/linux/socket_unix_non_stream.h +++ b/test/syscalls/linux/socket_unix_non_stream.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc index fddcdf1c5..92f40f5e5 100644 --- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_non_stream_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc index 85999db04..28a437339 100644 --- a/test/syscalls/linux/socket_unix_pair.cc +++ b/test/syscalls/linux/socket_unix_pair.cc @@ -14,10 +14,10 @@ #include <vector> -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/socket_unix.h" #include "test/syscalls/linux/socket_unix_cmsg.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc index 281410a9a..39360896b 100644 --- a/test/syscalls/linux/socket_unix_pair_nonblock.cc +++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_non_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc index d6e7031c0..2a2741eb9 100644 --- a/test/syscalls/linux/socket_unix_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_seqpacket.cc @@ -20,8 +20,8 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_seqpacket.h b/test/syscalls/linux/socket_unix_seqpacket.h index 30d9b9edf..5bb36af92 100644 --- a/test/syscalls/linux/socket_unix_seqpacket.h +++ b/test/syscalls/linux/socket_unix_seqpacket.h @@ -15,7 +15,7 @@ #ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_ -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc index 69a5f150d..492416a77 100644 --- a/test/syscalls/linux/socket_unix_seqpacket_local.cc +++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc @@ -15,10 +15,10 @@ #include <vector> #include "test/syscalls/linux/socket_non_stream.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/socket_unix_non_stream.h" #include "test/syscalls/linux/socket_unix_seqpacket.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc index 3ff810914..2f3cfc3f3 100644 --- a/test/syscalls/linux/socket_unix_stream.cc +++ b/test/syscalls/linux/socket_unix_stream.cc @@ -19,8 +19,8 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -181,6 +181,21 @@ TEST_P(StreamUnixSocketPairTest, SetSocketSendBuf) { ASSERT_EQ(quarter_sz, val); } +TEST_P(StreamUnixSocketPairTest, SendBufferOverflow) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + auto s = sockets->first_fd(); + + constexpr int kBufSz = 4096; + std::vector<char> buf(kBufSz * 4); + ASSERT_THAT(RetryEINTR(send)(s, buf.data(), buf.size(), MSG_DONTWAIT), + SyscallSucceeds()); + // The new buffer size should be smaller that the amount of data in the queue. + ASSERT_THAT(setsockopt(s, SOL_SOCKET, SO_SNDBUF, &kBufSz, sizeof(kBufSz)), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(send)(s, buf.data(), buf.size(), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + TEST_P(StreamUnixSocketPairTest, IncreasedSocketSendBufUnblocksWrites) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); int sock = sockets->first_fd(); diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc index 8429bd429..97a6bb327 100644 --- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_stream_blocking.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc index a7e3449a9..4b267ccae 100644 --- a/test/syscalls/linux/socket_unix_stream_local.cc +++ b/test/syscalls/linux/socket_unix_stream_local.cc @@ -15,8 +15,8 @@ #include <vector> #include "test/syscalls/linux/socket_stream.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc index 4b763c8e2..d7bf7747b 100644 --- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc +++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc @@ -14,8 +14,8 @@ #include <vector> #include "test/syscalls/linux/socket_stream_nonblock.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc index dd3d25450..0f6864266 100644 --- a/test/syscalls/linux/socket_unix_unbound_abstract.cc +++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc @@ -16,8 +16,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc index 907dca0f1..ccf2c94a1 100644 --- a/test/syscalls/linux/socket_unix_unbound_dgram.cc +++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc @@ -17,8 +17,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc index a035fb095..811fe12a1 100644 --- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc +++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc @@ -17,9 +17,9 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc index cb99030f5..e22018890 100644 --- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc @@ -16,8 +16,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc index f185dded3..b10062bc2 100644 --- a/test/syscalls/linux/socket_unix_unbound_stream.cc +++ b/test/syscalls/linux/socket_unix_unbound_stream.cc @@ -16,8 +16,8 @@ #include <sys/un.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc index c85f6da0b..4a10ae8d2 100644 --- a/test/syscalls/linux/splice.cc +++ b/test/syscalls/linux/splice.cc @@ -195,81 +195,6 @@ TEST(SpliceTest, PipeOffsets) { SyscallFailsWithErrno(ESPIPE)); } -// Event FDs may be used with splice without an offset. -TEST(SpliceTest, FromEventFD) { - // Open the input eventfd with an initial value so that it is readable. - constexpr uint64_t kEventFDValue = 1; - int efd; - ASSERT_THAT(efd = eventfd(kEventFDValue, 0), SyscallSucceeds()); - const FileDescriptor in_fd(efd); - - // Create a new pipe. - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - const FileDescriptor rfd(fds[0]); - const FileDescriptor wfd(fds[1]); - - // Splice 8-byte eventfd value to pipe. - constexpr int kEventFDSize = 8; - EXPECT_THAT(splice(in_fd.get(), nullptr, wfd.get(), nullptr, kEventFDSize, 0), - SyscallSucceedsWithValue(kEventFDSize)); - - // Contents should be equal. - std::vector<char> rbuf(kEventFDSize); - ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()), - SyscallSucceedsWithValue(kEventFDSize)); - EXPECT_EQ(memcmp(rbuf.data(), &kEventFDValue, rbuf.size()), 0); -} - -// Event FDs may not be used with splice with an offset. -TEST(SpliceTest, FromEventFDOffset) { - int efd; - ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds()); - const FileDescriptor in_fd(efd); - - // Create a new pipe. - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - const FileDescriptor rfd(fds[0]); - const FileDescriptor wfd(fds[1]); - - // Attempt to splice 8-byte eventfd value to pipe with offset. - // - // This is not allowed because eventfd doesn't support pread. - constexpr int kEventFDSize = 8; - loff_t in_off = 0; - EXPECT_THAT(splice(in_fd.get(), &in_off, wfd.get(), nullptr, kEventFDSize, 0), - SyscallFailsWithErrno(EINVAL)); -} - -// Event FDs may not be used with splice with an offset. -TEST(SpliceTest, ToEventFDOffset) { - // Create a new pipe. - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - const FileDescriptor rfd(fds[0]); - const FileDescriptor wfd(fds[1]); - - // Fill with a value. - constexpr int kEventFDSize = 8; - std::vector<char> buf(kEventFDSize); - buf[0] = 1; - ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), - SyscallSucceedsWithValue(kEventFDSize)); - - int efd; - ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds()); - const FileDescriptor out_fd(efd); - - // Attempt to splice 8-byte eventfd value to pipe with offset. - // - // This is not allowed because eventfd doesn't support pwrite. - loff_t out_off = 0; - EXPECT_THAT( - splice(rfd.get(), nullptr, out_fd.get(), &out_off, kEventFDSize, 0), - SyscallFailsWithErrno(EINVAL)); -} - TEST(SpliceTest, ToPipe) { // Open the input file. const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); @@ -852,34 +777,6 @@ TEST(SpliceTest, FromPipeMaxFileSize) { EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0); } -TEST(SpliceTest, FromPipeToDevZero) { - // Create a new pipe. - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - const FileDescriptor rfd(fds[0]); - FileDescriptor wfd(fds[1]); - - // Fill with some random data. - std::vector<char> buf(kPageSize); - RandomizeBuffer(buf.data(), buf.size()); - ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), - SyscallSucceedsWithValue(kPageSize)); - - const FileDescriptor zero = - ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY)); - - // Close the write end to prevent blocking below. - wfd.reset(); - - // Splice to /dev/zero. The first call should empty the pipe, and the return - // value should not exceed the number of bytes available for reading. - EXPECT_THAT( - splice(rfd.get(), nullptr, zero.get(), nullptr, kPageSize + 123, 0), - SyscallSucceedsWithValue(kPageSize)); - EXPECT_THAT(splice(rfd.get(), nullptr, zero.get(), nullptr, 1, 0), - SyscallSucceedsWithValue(0)); -} - static volatile int signaled = 0; void SigUsr1Handler(int sig, siginfo_t* info, void* context) { signaled = 1; } diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 72f888659..19dc80d0c 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -765,7 +765,7 @@ TEST_F(StatTest, StatxSymlink) { SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); - std::string parent_dir = "/tmp"; + std::string parent_dir = GetAbsoluteTestTmpdir(); TempPath link = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateSymlinkTo(parent_dir, test_file_name_)); std::string p = link.path(); diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc index d4ea8e026..d057cdc09 100644 --- a/test/syscalls/linux/statfs.cc +++ b/test/syscalls/linux/statfs.cc @@ -28,7 +28,7 @@ namespace testing { namespace { TEST(StatfsTest, CannotStatBadPath) { - auto temp_file = NewTempAbsPathInDir("/tmp"); + auto temp_file = NewTempAbsPath(); struct statfs st; EXPECT_THAT(statfs(temp_file.c_str(), &st), SyscallFailsWithErrno(ENOENT)); diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 5bfdecc79..3fbbf1423 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -29,9 +29,9 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -1182,6 +1182,62 @@ TEST_P(SimpleTcpSocketTest, SelfConnectSend) { EXPECT_THAT(shutdown(s.get(), SHUT_WR), SyscallSucceedsWithValue(0)); } +TEST_P(SimpleTcpSocketTest, SelfConnectSendShutdownWrite) { + // Initialize address to the loopback one. + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + ASSERT_THAT(bind(s.get(), AsSockAddr(&addr), addrlen), SyscallSucceeds()); + // Get the bound port. + ASSERT_THAT(getsockname(s.get(), AsSockAddr(&addr), &addrlen), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(connect)(s.get(), AsSockAddr(&addr), addrlen), + SyscallSucceeds()); + + // Write enough data to fill send and receive buffers. + size_t write_size = 24 << 20; // 24 MiB. + std::vector<char> writebuf(write_size); + + ScopedThread t([&s]() { + absl::SleepFor(absl::Milliseconds(250)); + ASSERT_THAT(shutdown(s.get(), SHUT_WR), SyscallSucceeds()); + }); + + // Try to send the whole thing. + int n; + ASSERT_THAT(n = SendFd(s.get(), writebuf.data(), writebuf.size(), 0), + SyscallFailsWithErrno(EPIPE)); +} + +TEST_P(SimpleTcpSocketTest, SelfConnectRecvShutdownRead) { + // Initialize address to the loopback one. + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + ASSERT_THAT(bind(s.get(), AsSockAddr(&addr), addrlen), SyscallSucceeds()); + // Get the bound port. + ASSERT_THAT(getsockname(s.get(), AsSockAddr(&addr), &addrlen), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(connect)(s.get(), AsSockAddr(&addr), addrlen), + SyscallSucceeds()); + + ScopedThread t([&s]() { + absl::SleepFor(absl::Milliseconds(250)); + ASSERT_THAT(shutdown(s.get(), SHUT_RD), SyscallSucceeds()); + }); + + char buf[1]; + EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallSucceedsWithValue(0)); +} + void NonBlockingConnect(int family, int16_t pollMask) { const FileDescriptor listener = ASSERT_NO_ERRNO_AND_VALUE(Socket(family, SOCK_STREAM, IPPROTO_TCP)); @@ -2032,6 +2088,66 @@ TEST_P(SimpleTcpSocketTest, ConnectUnspecifiedAddress) { } } +// Tests that send will return EWOULDBLOCK initially with large buffer and will +// succeed after the send buffer size is increased. +TEST_P(TcpSocketTest, SendUnblocksOnSendBufferIncrease) { + // Set the FD to O_NONBLOCK. + int opts; + ASSERT_THAT(opts = fcntl(first_fd, F_GETFL), SyscallSucceeds()); + opts |= O_NONBLOCK; + ASSERT_THAT(fcntl(first_fd, F_SETFL, opts), SyscallSucceeds()); + + // Get maximum buffer size by trying to set it to a large value. + constexpr int kSndBufSz = 0xffffffff; + ASSERT_THAT(setsockopt(first_fd, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, + sizeof(kSndBufSz)), + SyscallSucceeds()); + + int max_buffer_sz = 0; + socklen_t max_len = sizeof(max_buffer_sz); + ASSERT_THAT( + getsockopt(first_fd, SOL_SOCKET, SO_SNDBUF, &max_buffer_sz, &max_len), + SyscallSucceeds()); + + int buffer_sz = max_buffer_sz >> 2; + EXPECT_THAT(setsockopt(first_fd, SOL_SOCKET, SO_SNDBUF, &buffer_sz, + sizeof(buffer_sz)), + SyscallSucceedsWithValue(0)); + + // Create a large buffer that will be used for sending. + std::vector<char> buffer(max_buffer_sz); + + // Write until we receive an error. + while (RetryEINTR(send)(first_fd, buffer.data(), buffer.size(), 0) != -1) { + // Sleep to give linux a chance to move data from the send buffer to the + // receive buffer. + usleep(10000); // 10ms. + } + + // The last error should have been EWOULDBLOCK. + ASSERT_EQ(errno, EWOULDBLOCK); + + ScopedThread send_thread([this]() { + int flags = 0; + ASSERT_THAT(flags = fcntl(first_fd, F_GETFL), SyscallSucceeds()); + EXPECT_THAT(fcntl(first_fd, F_SETFL, flags & ~O_NONBLOCK), + SyscallSucceeds()); + + // Expect the send() to succeed. + char buffer; + ASSERT_THAT(RetryEINTR(send)(first_fd, &buffer, sizeof(buffer), 0), + SyscallSucceeds()); + }); + + // Set SO_SNDBUF to maximum buffer size allowed. + buffer_sz = max_buffer_sz >> 1; + EXPECT_THAT(setsockopt(first_fd, SOL_SOCKET, SO_SNDBUF, &buffer_sz, + sizeof(buffer_sz)), + SyscallSucceedsWithValue(0)); + + send_thread.Join(); +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest, ::testing::Values(AF_INET, AF_INET6)); diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc index 279fe342c..956208bee 100644 --- a/test/syscalls/linux/tuntap.cc +++ b/test/syscalls/linux/tuntap.cc @@ -24,16 +24,18 @@ #include <sys/socket.h> #include <sys/types.h> +#include <cstddef> + #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/strings/ascii.h" #include "absl/strings/str_split.h" #include "test/syscalls/linux/socket_netlink_route_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" #include "test/util/posix_error.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { @@ -44,6 +46,7 @@ constexpr int kIPLen = 4; constexpr const char kDevNetTun[] = "/dev/net/tun"; constexpr const char kTapName[] = "tap0"; +constexpr const char kTunName[] = "tun0"; #define kTapIPAddr htonl(0x0a000001) /* Inet 10.0.0.1 */ #define kTapPeerIPAddr htonl(0x0a000002) /* Inet 10.0.0.2 */ @@ -70,29 +73,14 @@ PosixErrorOr<Link> GetLinkByName(const std::string& name) { return PosixError(ENOENT, "interface not found"); } -struct pihdr { - uint16_t pi_flags; - uint16_t pi_protocol; -} __attribute__((packed)); - -struct ping_pkt { - pihdr pi; - struct ethhdr eth; - struct iphdr ip; - struct icmphdr icmp; +struct ping_ip_pkt { + iphdr ip; + icmphdr icmp; char payload[64]; } __attribute__((packed)); -ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip, - const uint8_t dstmac[ETH_ALEN], - const in_addr_t dstip) { - ping_pkt pkt = {}; - - pkt.pi.pi_protocol = htons(ETH_P_IP); - - memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest)); - memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source)); - pkt.eth.h_proto = htons(ETH_P_IP); +ping_ip_pkt CreatePingIPPacket(const in_addr_t srcip, const in_addr_t dstip) { + ping_ip_pkt pkt = {}; pkt.ip.ihl = 5; pkt.ip.version = 4; @@ -119,6 +107,33 @@ ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip, return pkt; } +struct pihdr { + uint16_t pi_flags; + uint16_t pi_protocol; +} __attribute__((packed)); + +struct ping_pkt { + pihdr pi; + ethhdr eth; + ping_ip_pkt ip_pkt; +} __attribute__((packed)); + +ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const in_addr_t srcip, + const uint8_t dstmac[ETH_ALEN], + const in_addr_t dstip) { + ping_pkt pkt = {}; + + pkt.pi.pi_protocol = htons(ETH_P_IP); + + memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest)); + memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source)); + pkt.eth.h_proto = htons(ETH_P_IP); + + pkt.ip_pkt = CreatePingIPPacket(srcip, dstip); + + return pkt; +} + struct arp_pkt { pihdr pi; struct ethhdr eth; @@ -271,13 +286,26 @@ TEST_F(TuntapTest, WriteToDownDevice) { EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO)); } -PosixErrorOr<FileDescriptor> OpenAndAttachTap(const std::string& dev_name, - const in_addr_t dev_addr) { +struct TunTapInterface { + FileDescriptor fd; + Link link; +}; + +PosixErrorOr<TunTapInterface> OpenAndAttachTunTap(const std::string& dev_name, + const in_addr_t dev_addr, + bool tap, bool no_pi) { // Interface creation. ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, Open(kDevNetTun, O_RDWR)); struct ifreq ifr_set = {}; - ifr_set.ifr_flags = IFF_TAP; + if (tap) { + ifr_set.ifr_flags |= IFF_TAP; + } else { + ifr_set.ifr_flags |= IFF_TUN; + } + if (no_pi) { + ifr_set.ifr_flags |= IFF_NO_PI; + } strncpy(ifr_set.ifr_name, dev_name.c_str(), IFNAMSIZ); if (ioctl(fd.get(), TUNSETIFF, &ifr_set) < 0) { return PosixError(errno); @@ -293,13 +321,15 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(const std::string& dev_name, if (!IsRunningOnGvisor()) { // FIXME(b/110961832): gVisor doesn't support setting MAC address on // interfaces yet. - RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA))); + if (tap) { + RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA))); + } // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces. RETURN_IF_ERRNO(LinkChangeFlags(link.index, IFF_UP, IFF_UP)); } - return fd; + return TunTapInterface{.fd = std::move(fd), .link = std::move(link)}; } // This test sets up a TAP device and pings kernel by sending ICMP echo request. @@ -319,8 +349,9 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(const std::string& dev_name, TEST_F(TuntapTest, PingKernel) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto& [fd, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); + ping_pkt ping_req = CreatePingPacket(kMacB, kTapPeerIPAddr, kMacA, kTapIPAddr); std::string arp_rep = @@ -362,10 +393,10 @@ TEST_F(TuntapTest, PingKernel) { // Process ping response packet. if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol && - r.ping.ip.protocol == ping_req.ip.protocol && - !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) && - !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) && - r.ping.icmp.type == 0 && r.ping.icmp.code == 0) { + r.ping.ip_pkt.ip.protocol == ping_req.ip_pkt.ip.protocol && + !memcmp(&r.ping.ip_pkt.ip.saddr, &ping_req.ip_pkt.ip.daddr, kIPLen) && + !memcmp(&r.ping.ip_pkt.ip.daddr, &ping_req.ip_pkt.ip.saddr, kIPLen) && + r.ping.ip_pkt.icmp.type == 0 && r.ping.ip_pkt.icmp.code == 0) { // Ends and passes the test. break; } @@ -375,8 +406,8 @@ TEST_F(TuntapTest, PingKernel) { TEST_F(TuntapTest, SendUdpTriggersArpResolution) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto& [fd, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); // Send a UDP packet to remote. int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); @@ -413,6 +444,45 @@ TEST_F(TuntapTest, SendUdpTriggersArpResolution) { } } +TEST_F(TuntapTest, TUNNoPacketInfo) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + + // Interface creation. + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR)); + + struct ifreq ifr_set = {}; + ifr_set.ifr_flags = IFF_TUN | IFF_NO_PI; + strncpy(ifr_set.ifr_name, kTunName, IFNAMSIZ); + EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set), SyscallSucceeds()); + + // Interface setup. + auto link = ASSERT_NO_ERRNO_AND_VALUE(GetLinkByName(kTunName)); + const struct in_addr dev_ipv4_addr = {.s_addr = kTapIPAddr}; + EXPECT_NO_ERRNO(LinkAddLocalAddr(link.index, AF_INET, 24, &dev_ipv4_addr, + sizeof(dev_ipv4_addr))); + + ping_ip_pkt ping_req = CreatePingIPPacket(kTapPeerIPAddr, kTapIPAddr); + + // Send ICMP query + EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)), + SyscallSucceedsWithValue(sizeof(ping_req))); + + // Receive loop to process inbound packets. + while (1) { + ping_ip_pkt ping_resp = {}; + EXPECT_THAT(read(fd.get(), &ping_resp, sizeof(ping_req)), + SyscallSucceedsWithValue(sizeof(ping_req))); + + // Process ping response packet. + if (!memcmp(&ping_resp.ip.saddr, &ping_req.ip.daddr, kIPLen) && + !memcmp(&ping_resp.ip.daddr, &ping_req.ip.saddr, kIPLen) && + ping_resp.icmp.type == 0 && ping_resp.icmp.code == 0) { + // Ends and passes the test. + break; + } + } +} + // TCPBlockingConnectFailsArpResolution tests for TCP connect to fail on link // address resolution failure to a routable, but non existent peer. TEST_F(TuntapTest, TCPBlockingConnectFailsArpResolution) { @@ -421,8 +491,8 @@ TEST_F(TuntapTest, TCPBlockingConnectFailsArpResolution) { FileDescriptor sender = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); sockaddr_in connect_addr = { .sin_family = AF_INET, @@ -443,8 +513,8 @@ TEST_F(TuntapTest, TCPNonBlockingConnectFailsArpResolution) { FileDescriptor sender = ASSERT_NO_ERRNO_AND_VALUE( Socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); sockaddr_in connect_addr = { .sin_family = AF_INET, @@ -474,8 +544,8 @@ TEST_F(TuntapTest, TCPNonBlockingConnectFailsArpResolution) { TEST_F(TuntapTest, WriteHangBug155928773) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, kTapIPAddr)); + const auto tuntap = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTapName, kTapIPAddr, true /* tap */, false /* no_pi */)); int sock = socket(AF_INET, SOCK_DGRAM, 0); ASSERT_THAT(sock, SyscallSucceeds()); @@ -490,5 +560,95 @@ TEST_F(TuntapTest, WriteHangBug155928773) { write(sock, "hello", 5); } +// Test that raw packet sockets do not need/include link headers when +// sending/receiving packets to/from pure L3 (e.g. TUN) interfaces. +TEST_F(TuntapTest, RawPacketSocket) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + auto [tun, link] = ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTunTap( + kTunName, kTapIPAddr, false /* tap */, true /* no_pi */)); + FileDescriptor packet_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP))); + + constexpr int kInfiniteTimeout = -1; + + uint8_t hardware_address_length = 0; + if (IsRunningOnGvisor()) { + // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have + // an ethernet address. + hardware_address_length = ETH_ALEN; + } + + { + const ping_ip_pkt ping_req = CreatePingIPPacket(kTapPeerIPAddr, kTapIPAddr); + ASSERT_THAT(write(tun.get(), &ping_req, sizeof(ping_req)), + SyscallSucceedsWithValue(sizeof(ping_req))); + // Wait for the packet socket to become readable. + pollfd pfd = { + .fd = packet_sock.get(), + .events = POLLIN, + }; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfiniteTimeout), + SyscallSucceedsWithValue(1)); + + char read_buf[sizeof(ping_req) + 1]; + struct sockaddr_ll src; + socklen_t src_len = sizeof(src); + ASSERT_THAT(recvfrom(packet_sock.get(), read_buf, sizeof(read_buf), 0, + reinterpret_cast<struct sockaddr*>(&src), &src_len), + SyscallSucceedsWithValue(sizeof(ping_req))); + EXPECT_EQ(memcmp(read_buf, &ping_req, sizeof(ping_req)), 0); + ASSERT_EQ(src_len, sizeof(src)); + EXPECT_EQ(src.sll_family, AF_PACKET); + EXPECT_EQ(ntohs(src.sll_protocol), ETH_P_IP); + EXPECT_EQ(src.sll_ifindex, link.index); + EXPECT_EQ(src.sll_pkttype, PACKET_HOST); + EXPECT_EQ(src.sll_halen, hardware_address_length); + if (IsRunningOnGvisor()) { + // TODO(https://gvisor.dev/issue/6531): Check this field for the right + // hardware type. + EXPECT_EQ(src.sll_hatype, 0); + } else { + EXPECT_EQ(src.sll_hatype, ARPHRD_NONE); + } + } + + { + const struct sockaddr_ll dest = { + .sll_family = AF_PACKET, + .sll_protocol = htons(ETH_P_IP), + .sll_ifindex = link.index, + .sll_halen = hardware_address_length, + }; + + const ping_ip_pkt ping_req = CreatePingIPPacket(kTapIPAddr, kTapPeerIPAddr); + ASSERT_THAT( + sendto(packet_sock.get(), &ping_req, sizeof(ping_req), 0, + reinterpret_cast<const struct sockaddr*>(&dest), sizeof(dest)), + SyscallSucceedsWithValue(sizeof(ping_req))); + + // Loop until we receive the packet we expect - the kernel may send packets + // we do not care about. + while (true) { + // Wait for the TUN interface to become readable. + pollfd pfd = { + .fd = tun.get(), + .events = POLLIN, + }; + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, kInfiniteTimeout), + SyscallSucceedsWithValue(1)); + + char read_buf[sizeof(ping_req) + 1]; + int n = read(tun.get(), &read_buf, sizeof(read_buf)); + ASSERT_THAT(n, SyscallSucceeds()); + if (n == sizeof(ping_req) && + memcmp(read_buf, &ping_req, sizeof(ping_req)) == 0) { + break; + } + } + } +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/udp_bind.cc b/test/syscalls/linux/udp_bind.cc index f68d78aa2..5ed115a14 100644 --- a/test/syscalls/linux/udp_bind.cc +++ b/test/syscalls/linux/udp_bind.cc @@ -17,8 +17,8 @@ #include <sys/types.h> #include "gtest/gtest.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" namespace gvisor { diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index 2b687c198..0c0a16074 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -18,6 +18,8 @@ #include <netinet/ip_icmp.h> #include <ctime> +#include <utility> +#include <vector> #ifdef __linux__ #include <linux/errqueue.h> @@ -39,10 +41,10 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/syscalls/linux/socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" +#include "test/util/socket_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -600,6 +602,67 @@ TEST_P(UdpSocketTest, DisconnectAfterBind) { SyscallFailsWithErrno(ENOTCONN)); } +void ConnectThenDisconnect(const FileDescriptor& sock, + const sockaddr* bind_addr, + const socklen_t expected_addrlen) { + // Connect the bound socket. + ASSERT_THAT(connect(sock.get(), bind_addr, expected_addrlen), + SyscallSucceeds()); + + // Disconnect. + { + sockaddr_storage unspec = {.ss_family = AF_UNSPEC}; + ASSERT_THAT(connect(sock.get(), AsSockAddr(&unspec), sizeof(unspec)), + SyscallSucceeds()); + } + { + // Check that we're not in a bound state. + sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + ASSERT_THAT(getsockname(sock.get(), AsSockAddr(&addr), &addrlen), + SyscallSucceeds()); + ASSERT_EQ(addrlen, expected_addrlen); + // Everything should be the zero value except the address family. + sockaddr_storage expected = { + .ss_family = bind_addr->sa_family, + }; + EXPECT_EQ(memcmp(&expected, &addr, expected_addrlen), 0); + } + + { + // We are not connected so we have no peer. + sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + EXPECT_THAT(getpeername(sock.get(), AsSockAddr(&addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); + } +} + +TEST_P(UdpSocketTest, DisconnectAfterBindToUnspecAndConnect) { + ASSERT_NO_ERRNO(BindLoopback()); + + sockaddr_storage unspec = {.ss_family = AF_UNSPEC}; + int bind_res = bind(sock_.get(), AsSockAddr(&unspec), sizeof(unspec)); + if ((!IsRunningOnGvisor() || IsRunningWithHostinet()) && + GetFamily() == AF_INET) { + // Linux allows this for undocumented compatibility reasons: + // https://github.com/torvalds/linux/commit/29c486df6a208432b370bd4be99ae1369ede28d8. + // + // TODO(https://gvisor.dev/issue/6575): Match Linux's behaviour. + ASSERT_THAT(bind_res, SyscallSucceeds()); + } else { + ASSERT_THAT(bind_res, SyscallFailsWithErrno(EAFNOSUPPORT)); + } + + ASSERT_NO_FATAL_FAILURE(ConnectThenDisconnect(sock_, bind_addr_, addrlen_)); +} + +TEST_P(UdpSocketTest, DisconnectAfterConnectWithoutBind) { + ASSERT_NO_ERRNO(BindLoopback()); + + ASSERT_NO_FATAL_FAILURE(ConnectThenDisconnect(sock_, bind_addr_, addrlen_)); +} + TEST_P(UdpSocketTest, BindToAnyConnnectToLocalhost) { ASSERT_NO_ERRNO(BindAny()); @@ -1429,12 +1492,8 @@ TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) { sendto(sock_.get(), buf + i * psize, 0, 0, bind_addr_, addrlen_), SyscallSucceedsWithValue(0)); - // TODO(gvisor.dev/issue/2726): sending a zero-length message to a hostinet - // socket does not cause a poll event to be triggered. - if (!IsRunningWithHostinet()) { - ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), - SyscallSucceedsWithValue(1)); - } + ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000), + SyscallSucceedsWithValue(1)); // Check that regardless of how many packets are in the queue, the size // reported is that of a single packet. @@ -1689,6 +1748,148 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) { ASSERT_EQ(tv.tv_usec, tv2.tv_usec); } +// TOS and TCLASS values may be different but IPv6 sockets with IPv4-mapped-IPv6 +// addresses use TOS (IPv4), not TCLASS (IPv6). +TEST_P(UdpSocketTest, DifferentTOSAndTClass) { + const int kFamily = GetFamily(); + constexpr int kToS = IPTOS_LOWDELAY; + constexpr int kTClass = IPTOS_THROUGHPUT; + ASSERT_NE(kToS, kTClass); + + if (kFamily == AF_INET6) { + ASSERT_THAT(setsockopt(sock_.get(), SOL_IPV6, IPV6_TCLASS, &kTClass, + sizeof(kTClass)), + SyscallSucceeds()); + + // Marking an IPv6 socket as IPv6 only should not affect the ability to + // configure IPv4 socket options as the V6ONLY flag may later be disabled so + // that applications may use the socket to send/receive IPv4 packets. + constexpr int on = 1; + ASSERT_THAT(setsockopt(sock_.get(), SOL_IPV6, IPV6_V6ONLY, &on, sizeof(on)), + SyscallSucceeds()); + } + + ASSERT_THAT(setsockopt(sock_.get(), SOL_IP, IP_TOS, &kToS, sizeof(kToS)), + SyscallSucceeds()); + + if (kFamily == AF_INET6) { + int got_tclass; + socklen_t got_tclass_len = sizeof(got_tclass); + ASSERT_THAT(getsockopt(sock_.get(), SOL_IPV6, IPV6_TCLASS, &got_tclass, + &got_tclass_len), + SyscallSucceeds()); + ASSERT_EQ(got_tclass_len, sizeof(got_tclass)); + EXPECT_EQ(got_tclass, kTClass); + } + + { + int got_tos; + socklen_t got_tos_len = sizeof(got_tos); + ASSERT_THAT(getsockopt(sock_.get(), SOL_IP, IP_TOS, &got_tos, &got_tos_len), + SyscallSucceeds()); + ASSERT_EQ(got_tos_len, sizeof(got_tos)); + EXPECT_EQ(got_tos, kToS); + } + + auto test_send = [this](sockaddr_storage addr, + std::function<void(const cmsghdr*)> cb) { + FileDescriptor bind = ASSERT_NO_ERRNO_AND_VALUE( + Socket(addr.ss_family, SOCK_DGRAM, IPPROTO_UDP)); + ASSERT_NO_ERRNO(BindSocket(bind.get(), reinterpret_cast<sockaddr*>(&addr))); + ASSERT_THAT(setsockopt(bind.get(), SOL_IP, IP_RECVTOS, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + if (addr.ss_family == AF_INET6) { + ASSERT_THAT(setsockopt(bind.get(), SOL_IPV6, IPV6_RECVTCLASS, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + } + + char sent_data[1024]; + iovec sent_iov = { + .iov_base = sent_data, + .iov_len = sizeof(sent_data), + }; + msghdr sent_msg = { + .msg_name = &addr, + .msg_namelen = sizeof(addr), + .msg_iov = &sent_iov, + .msg_iovlen = 1, + }; + ASSERT_THAT(RetryEINTR(sendmsg)(sock_.get(), &sent_msg, 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data) + 1]; + iovec received_iov = { + .iov_base = received_data, + .iov_len = sizeof(received_data), + }; + std::vector<char> received_cmsgbuf(CMSG_SPACE(sizeof(int8_t))); + msghdr received_msg = { + .msg_iov = &received_iov, + .msg_iovlen = 1, + .msg_control = received_cmsgbuf.data(), + .msg_controllen = static_cast<socklen_t>(received_cmsgbuf.size()), + }; + ASSERT_THAT(RetryEINTR(recvmsg)(bind.get(), &received_msg, 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + + cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg); + ASSERT_NE(cmsg, nullptr); + ASSERT_NO_FATAL_FAILURE(cb(cmsg)); + EXPECT_EQ(CMSG_NXTHDR(&received_msg, cmsg), nullptr); + }; + + if (kFamily == AF_INET6) { + SCOPED_TRACE( + "Send IPv4 loopback packet using IPv6 socket via IPv4-mapped-IPv6"); + + constexpr int off = 0; + ASSERT_THAT( + setsockopt(sock_.get(), SOL_IPV6, IPV6_V6ONLY, &off, sizeof(off)), + SyscallSucceeds()); + + // Send a packet and make sure that the ToS value in the IPv4 header is + // the configured IPv4 ToS Value and not the IPv6 Traffic Class value even + // though we use an IPv6 socket to send an IPv4 packet. + ASSERT_NO_FATAL_FAILURE( + test_send(V4MappedLoopback().addr, [kToS](const cmsghdr* cmsg) { + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int8_t))); + EXPECT_EQ(cmsg->cmsg_level, SOL_IP); + EXPECT_EQ(cmsg->cmsg_type, IP_TOS); + int8_t received; + memcpy(&received, CMSG_DATA(cmsg), sizeof(received)); + EXPECT_EQ(received, kToS); + })); + } + + { + SCOPED_TRACE("Send loopback packet"); + + ASSERT_NO_FATAL_FAILURE(test_send( + InetLoopbackAddr(), [kFamily, kTClass, kToS](const cmsghdr* cmsg) { + switch (kFamily) { + case AF_INET: { + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int8_t))); + EXPECT_EQ(cmsg->cmsg_level, SOL_IP); + EXPECT_EQ(cmsg->cmsg_type, IP_TOS); + int8_t received; + memcpy(&received, CMSG_DATA(cmsg), sizeof(received)); + EXPECT_EQ(received, kToS); + } break; + case AF_INET6: { + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int32_t))); + EXPECT_EQ(cmsg->cmsg_level, SOL_IPV6); + EXPECT_EQ(cmsg->cmsg_type, IPV6_TCLASS); + int32_t received; + memcpy(&received, CMSG_DATA(cmsg), sizeof(received)); + EXPECT_EQ(received, kTClass); + } break; + } + })); + } +} + // Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on // outgoing packets, and that a receiving socket with IP_RECVTOS or // IPV6_RECVTCLASS will create the corresponding control message. diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc index 4139a18d8..d95a3e010 100644 --- a/test/syscalls/linux/uidgid.cc +++ b/test/syscalls/linux/uidgid.cc @@ -170,7 +170,9 @@ TEST(UidGidRootTest, SetgidNotFromThreadGroupLeader) { const gid_t gid = absl::GetFlag(FLAGS_scratch_gid1); // NOTE(b/64676707): Do setgid in a separate thread so that we can test if // info.si_pid is set correctly. - ScopedThread([gid] { ASSERT_THAT(setgid(gid), SyscallSucceeds()); }); + ScopedThread thread = + ScopedThread([gid] { ASSERT_THAT(setgid(gid), SyscallSucceeds()); }); + thread.Join(); EXPECT_NO_ERRNO(CheckGIDs(gid, gid, gid)); #pragma pop_macro("allow_setgid") diff --git a/test/syscalls/linux/uname.cc b/test/syscalls/linux/uname.cc index 759ea4f53..c52abef5c 100644 --- a/test/syscalls/linux/uname.cc +++ b/test/syscalls/linux/uname.cc @@ -88,7 +88,7 @@ TEST(UnameTest, UnshareUTS) { struct utsname init; ASSERT_THAT(uname(&init), SyscallSucceeds()); - ScopedThread([&]() { + ScopedThread thread = ScopedThread([&]() { EXPECT_THAT(unshare(CLONE_NEWUTS), SyscallSucceeds()); constexpr char kHostname[] = "wubbalubba"; @@ -97,6 +97,7 @@ TEST(UnameTest, UnshareUTS) { char hostname[65]; EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds()); }); + thread.Join(); struct utsname after; EXPECT_THAT(uname(&after), SyscallSucceeds()); diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h index b8073db17..4240bd5f6 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.h +++ b/test/syscalls/linux/unix_domain_socket_test_util.h @@ -17,7 +17,7 @@ #include <string> -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/syscalls/linux/verity_getdents.cc b/test/syscalls/linux/verity_getdents.cc index 093595dd3..822a75254 100644 --- a/test/syscalls/linux/verity_getdents.cc +++ b/test/syscalls/linux/verity_getdents.cc @@ -58,16 +58,16 @@ class GetDentsTest : public ::testing::Test { }; TEST_F(GetDentsTest, GetDents) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); std::vector<std::string> expect = {".", "..", filename_}; EXPECT_NO_ERRNO(DirContains(verity_dir, expect, /*exclude=*/{})); } TEST_F(GetDentsTest, Deleted) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); EXPECT_THAT(unlink(JoinPath(tmpfs_dir_.path(), filename_).c_str()), SyscallSucceeds()); @@ -77,8 +77,8 @@ TEST_F(GetDentsTest, Deleted) { } TEST_F(GetDentsTest, Renamed) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); std::string new_file_name = "renamed-" + filename_; EXPECT_THAT(rename(JoinPath(tmpfs_dir_.path(), filename_).c_str(), diff --git a/test/syscalls/linux/verity_ioctl.cc b/test/syscalls/linux/verity_ioctl.cc index be91b23d0..45650809c 100644 --- a/test/syscalls/linux/verity_ioctl.cc +++ b/test/syscalls/linux/verity_ioctl.cc @@ -105,8 +105,8 @@ TEST_F(IoctlTest, Measure) { } TEST_F(IoctlTest, Mount) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Make sure the file can be open and read in the mounted verity fs. auto const verity_fd = ASSERT_NO_ERRNO_AND_VALUE( @@ -117,8 +117,8 @@ TEST_F(IoctlTest, Mount) { } TEST_F(IoctlTest, NonExistingFile) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Confirm that opening a non-existing file in the verity-enabled directory // triggers the expected error instead of verification failure. @@ -128,8 +128,8 @@ TEST_F(IoctlTest, NonExistingFile) { } TEST_F(IoctlTest, ModifiedFile) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Modify the file and check verification failure upon reading from it. auto const fd = ASSERT_NO_ERRNO_AND_VALUE( @@ -143,8 +143,8 @@ TEST_F(IoctlTest, ModifiedFile) { } TEST_F(IoctlTest, ModifiedMerkle) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Modify the Merkle file and check verification failure upon opening the // corresponding file. @@ -158,8 +158,8 @@ TEST_F(IoctlTest, ModifiedMerkle) { } TEST_F(IoctlTest, ModifiedDirMerkle) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Modify the Merkle file for the parent directory and check verification // failure upon opening the corresponding file. @@ -173,8 +173,8 @@ TEST_F(IoctlTest, ModifiedDirMerkle) { } TEST_F(IoctlTest, Stat) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); struct stat st; EXPECT_THAT(stat(JoinPath(verity_dir, filename_).c_str(), &st), @@ -182,8 +182,8 @@ TEST_F(IoctlTest, Stat) { } TEST_F(IoctlTest, ModifiedStat) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); EXPECT_THAT(chmod(JoinPath(tmpfs_dir_.path(), filename_).c_str(), 0644), SyscallSucceeds()); @@ -193,8 +193,8 @@ TEST_F(IoctlTest, ModifiedStat) { } TEST_F(IoctlTest, DeleteFile) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); EXPECT_THAT(unlink(JoinPath(tmpfs_dir_.path(), filename_).c_str()), SyscallSucceeds()); @@ -203,8 +203,8 @@ TEST_F(IoctlTest, DeleteFile) { } TEST_F(IoctlTest, DeleteMerkle) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); EXPECT_THAT( unlink(MerklePath(JoinPath(tmpfs_dir_.path(), filename_)).c_str()), @@ -214,8 +214,8 @@ TEST_F(IoctlTest, DeleteMerkle) { } TEST_F(IoctlTest, RenameFile) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); std::string new_file_name = "renamed-" + filename_; EXPECT_THAT(rename(JoinPath(tmpfs_dir_.path(), filename_).c_str(), @@ -226,8 +226,8 @@ TEST_F(IoctlTest, RenameFile) { } TEST_F(IoctlTest, RenameMerkle) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); std::string new_file_name = "renamed-" + filename_; EXPECT_THAT( diff --git a/test/syscalls/linux/verity_mmap.cc b/test/syscalls/linux/verity_mmap.cc index dde74cc91..2bfd43b16 100644 --- a/test/syscalls/linux/verity_mmap.cc +++ b/test/syscalls/linux/verity_mmap.cc @@ -57,8 +57,8 @@ class MmapTest : public ::testing::Test { }; TEST_F(MmapTest, MmapRead) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Make sure the file can be open and mmapped in the mounted verity fs. auto const verity_fd = ASSERT_NO_ERRNO_AND_VALUE( @@ -71,8 +71,8 @@ TEST_F(MmapTest, MmapRead) { } TEST_F(MmapTest, ModifiedBeforeMmap) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Modify the file and check verification failure upon mmapping. auto const fd = ASSERT_NO_ERRNO_AND_VALUE( @@ -90,8 +90,8 @@ TEST_F(MmapTest, ModifiedBeforeMmap) { } TEST_F(MmapTest, ModifiedAfterMmap) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); auto const verity_fd = ASSERT_NO_ERRNO_AND_VALUE( Open(JoinPath(verity_dir, filename_), O_RDONLY, 0777)); @@ -126,8 +126,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn({MAP_SHARED, MAP_PRIVATE}))); TEST_P(MmapParamTest, Mmap) { - std::string verity_dir = - ASSERT_NO_ERRNO_AND_VALUE(MountVerity(tmpfs_dir_.path(), filename_)); + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE( + MountVerity(tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY)})); // Make sure the file can be open and mmapped in the mounted verity fs. auto const verity_fd = ASSERT_NO_ERRNO_AND_VALUE( diff --git a/test/syscalls/linux/verity_symlink.cc b/test/syscalls/linux/verity_symlink.cc new file mode 100644 index 000000000..c6fce8ead --- /dev/null +++ b/test/syscalls/linux/verity_symlink.cc @@ -0,0 +1,117 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdint.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/stat.h> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/mount_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/verity_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +const char kSymlink[] = "verity_symlink"; + +class SymlinkTest : public ::testing::Test { + protected: + void SetUp() override { + // Verity is implemented in VFS2. + SKIP_IF(IsRunningWithVFS1()); + + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + // Mount a tmpfs file system, to be wrapped by a verity fs. + tmpfs_dir_ = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_THAT(mount("", tmpfs_dir_.path().c_str(), "tmpfs", 0, ""), + SyscallSucceeds()); + + // Create a new file in the tmpfs mount. + file_ = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(tmpfs_dir_.path(), kContents, 0777)); + filename_ = Basename(file_.path()); + + // Create a symlink to the file. + ASSERT_THAT(symlink(file_.path().c_str(), + JoinPath(tmpfs_dir_.path(), kSymlink).c_str()), + SyscallSucceeds()); + } + + TempPath tmpfs_dir_; + TempPath file_; + std::string filename_; +}; + +TEST_F(SymlinkTest, Success) { + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE(MountVerity( + tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY), + EnableTarget(kSymlink, O_RDONLY | O_NOFOLLOW)})); + + char buf[256]; + EXPECT_THAT( + readlink(JoinPath(verity_dir, kSymlink).c_str(), buf, sizeof(buf)), + SyscallSucceeds()); + auto const verity_fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(JoinPath(verity_dir, kSymlink).c_str(), O_RDONLY, 0777)); + EXPECT_THAT(ReadFd(verity_fd.get(), buf, sizeof(kContents)), + SyscallSucceeds()); +} + +TEST_F(SymlinkTest, DeleteLink) { + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE(MountVerity( + tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY), + EnableTarget(kSymlink, O_RDONLY | O_NOFOLLOW)})); + + ASSERT_THAT(unlink(JoinPath(tmpfs_dir_.path(), kSymlink).c_str()), + SyscallSucceeds()); + char buf[256]; + EXPECT_THAT( + readlink(JoinPath(verity_dir, kSymlink).c_str(), buf, sizeof(buf)), + SyscallFailsWithErrno(EIO)); + EXPECT_THAT(open(JoinPath(verity_dir, kSymlink).c_str(), O_RDONLY, 0777), + SyscallFailsWithErrno(EIO)); +} + +TEST_F(SymlinkTest, ModifyLink) { + std::string verity_dir = ASSERT_NO_ERRNO_AND_VALUE(MountVerity( + tmpfs_dir_.path(), {EnableTarget(filename_, O_RDONLY), + EnableTarget(kSymlink, O_RDONLY | O_NOFOLLOW)})); + + ASSERT_THAT(unlink(JoinPath(tmpfs_dir_.path(), kSymlink).c_str()), + SyscallSucceeds()); + + std::string newlink = "newlink"; + ASSERT_THAT(symlink(JoinPath(tmpfs_dir_.path(), newlink).c_str(), + JoinPath(tmpfs_dir_.path(), kSymlink).c_str()), + SyscallSucceeds()); + char buf[256]; + EXPECT_THAT( + readlink(JoinPath(verity_dir, kSymlink).c_str(), buf, sizeof(buf)), + SyscallFailsWithErrno(EIO)); + EXPECT_THAT(open(JoinPath(verity_dir, kSymlink).c_str(), O_RDONLY, 0777), + SyscallFailsWithErrno(EIO)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/util/BUILD b/test/util/BUILD index cc83221ea..5b6bcb32c 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "cc_library", "cc_test", "coreutil", "gbenchmark", "gtest", "select_system") +load("//tools:defs.bzl", "cc_library", "cc_test", "coreutil", "default_net_util", "gbenchmark_internal", "gtest", "select_system") package( default_visibility = ["//:sandbox"], @@ -8,13 +8,20 @@ package( cc_library( name = "capability_util", testonly = 1, - srcs = ["capability_util.cc"], - hdrs = ["capability_util.h"], + srcs = [ + "fuchsia_capability_util.cc", + "linux_capability_util.cc", + ], + hdrs = [ + "capability_util.h", + "linux_capability_util.h", + ], deps = [ ":cleanup", ":memory_util", ":posix_error", ":save_util", + ":socket_util", ":test_util", "@com_google_absl//absl/strings", ], @@ -288,7 +295,7 @@ cc_library( "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/time", gtest, - gbenchmark, + gbenchmark_internal, ], ) @@ -343,6 +350,18 @@ cc_library( ) cc_library( + name = "benchmark_main", + testonly = 1, + linkstatic = 1, + deps = [ + ":test_util", + "@com_google_absl//absl/flags:flag", + gtest, + gbenchmark_internal, + ], +) + +cc_library( name = "epoll_util", testonly = 1, srcs = ["epoll_util.cc"], @@ -414,3 +433,27 @@ cc_library( ":temp_path", ], ) + +cc_library( + name = "socket_util", + testonly = 1, + srcs = [ + "socket_util.cc", + "socket_util_impl.cc", + ], + hdrs = ["socket_util.h"], + defines = select_system(), + deps = default_net_util() + [ + gtest, + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/time", + "@com_google_absl//absl/types:optional", + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_util", + "//test/util:thread_util", + ], +) diff --git a/test/util/capability_util.h b/test/util/capability_util.h index f2c370125..ac1a1b32b 100644 --- a/test/util/capability_util.h +++ b/test/util/capability_util.h @@ -17,107 +17,32 @@ #ifndef GVISOR_TEST_UTIL_CAPABILITY_UTIL_H_ #define GVISOR_TEST_UTIL_CAPABILITY_UTIL_H_ -#include <errno.h> -#include <linux/capability.h> -#include <sys/syscall.h> -#include <unistd.h> - -#include "test/util/cleanup.h" #include "test/util/posix_error.h" -#include "test/util/save_util.h" -#include "test/util/test_util.h" -#ifndef _LINUX_CAPABILITY_VERSION_3 -#error Expecting _LINUX_CAPABILITY_VERSION_3 support +#if defined(__Fuchsia__) +// Nothing to include. +#elif defined(__linux__) +#include "test/util/linux_capability_util.h" +#else +#error "Unhandled platform" #endif namespace gvisor { namespace testing { -// HaveCapability returns true if the process has the specified EFFECTIVE -// capability. -inline PosixErrorOr<bool> HaveCapability(int cap) { - if (!cap_valid(cap)) { - return PosixError(EINVAL, "Invalid capability"); - } - - struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; - struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; - RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); - MaybeSave(); - - return (caps[CAP_TO_INDEX(cap)].effective & CAP_TO_MASK(cap)) != 0; -} - -// SetCapability sets the specified EFFECTIVE capability. -inline PosixError SetCapability(int cap, bool set) { - if (!cap_valid(cap)) { - return PosixError(EINVAL, "Invalid capability"); - } - - struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; - struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; - RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); - MaybeSave(); - - if (set) { - caps[CAP_TO_INDEX(cap)].effective |= CAP_TO_MASK(cap); - } else { - caps[CAP_TO_INDEX(cap)].effective &= ~CAP_TO_MASK(cap); - } - header = {_LINUX_CAPABILITY_VERSION_3, 0}; - RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capset, &header, &caps)); - MaybeSave(); - - return NoError(); -} - -// DropPermittedCapability drops the specified PERMITTED. The EFFECTIVE -// capabilities must be a subset of PERMITTED, so those are dropped as well. -inline PosixError DropPermittedCapability(int cap) { - if (!cap_valid(cap)) { - return PosixError(EINVAL, "Invalid capability"); - } - - struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; - struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; - RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); - MaybeSave(); - - caps[CAP_TO_INDEX(cap)].effective &= ~CAP_TO_MASK(cap); - caps[CAP_TO_INDEX(cap)].permitted &= ~CAP_TO_MASK(cap); - - header = {_LINUX_CAPABILITY_VERSION_3, 0}; - RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capset, &header, &caps)); - MaybeSave(); - - return NoError(); -} - -PosixErrorOr<bool> CanCreateUserNamespace(); - -class AutoCapability { - public: - AutoCapability(int cap, bool set) : cap_(cap), set_(set) { - const bool has = EXPECT_NO_ERRNO_AND_VALUE(HaveCapability(cap)); - if (set != has) { - EXPECT_NO_ERRNO(SetCapability(cap_, set_)); - applied_ = true; - } - } - - ~AutoCapability() { - if (applied_) { - EXPECT_NO_ERRNO(SetCapability(cap_, !set_)); - } - } +// HaveRawIPSocketCapability returns whether or not the process has access to +// raw IP sockets. +// +// Returns an error when raw IP socket access cannot be determined. +PosixErrorOr<bool> HaveRawIPSocketCapability(); - private: - int cap_; - bool set_; - bool applied_ = false; -}; +// HavePacketSocketCapability returns whether or not the process has access to +// packet sockets. +// +// Returns an error when packet socket access cannot be determined. +PosixErrorOr<bool> HavePacketSocketCapability(); } // namespace testing } // namespace gvisor + #endif // GVISOR_TEST_UTIL_CAPABILITY_UTIL_H_ diff --git a/test/util/cgroup_util.cc b/test/util/cgroup_util.cc index 977993f41..df3c57b87 100644 --- a/test/util/cgroup_util.cc +++ b/test/util/cgroup_util.cc @@ -25,12 +25,26 @@ namespace gvisor { namespace testing { -Cgroup::Cgroup(std::string_view path) : cgroup_path_(path) { +Cgroup::Cgroup(absl::string_view path) : cgroup_path_(path) { id_ = ++Cgroup::next_id_; std::cerr << absl::StreamFormat("[cg#%d] <= %s", id_, cgroup_path_) << std::endl; } +PosixErrorOr<Cgroup> Cgroup::RecursivelyCreate(absl::string_view path) { + RETURN_IF_ERRNO(RecursivelyCreateDir(path)); + return Cgroup(path); +} + +PosixErrorOr<Cgroup> Cgroup::Create(absl::string_view path) { + RETURN_IF_ERRNO(Mkdir(path)); + return Cgroup(path); +} + +PosixErrorOr<Cgroup> Cgroup::CreateChild(absl::string_view name) const { + return Cgroup::Create(JoinPath(Path(), name)); +} + PosixErrorOr<std::string> Cgroup::ReadControlFile( absl::string_view name) const { std::string buf; @@ -93,7 +107,7 @@ PosixErrorOr<absl::flat_hash_set<pid_t>> Cgroup::ParsePIDList( absl::string_view data) const { absl::flat_hash_set<pid_t> res; std::vector<absl::string_view> lines = absl::StrSplit(data, '\n'); - for (const std::string_view& line : lines) { + for (const absl::string_view& line : lines) { if (line.empty()) { continue; } diff --git a/test/util/cgroup_util.h b/test/util/cgroup_util.h index e3f696a89..ccc7219e3 100644 --- a/test/util/cgroup_util.h +++ b/test/util/cgroup_util.h @@ -34,8 +34,20 @@ class Cgroup { uint64_t id() const { return id_; } + // RecursivelyCreate creates cgroup specified by path, including all + // components leading up to path. Path should end inside a cgroupfs mount. If + // path already exists, RecursivelyCreate does nothing and silently succeeds. + static PosixErrorOr<Cgroup> RecursivelyCreate(std::string_view path); + + // Creates a new cgroup at path. The parent directory must exist and be a + // cgroupfs directory. + static PosixErrorOr<Cgroup> Create(std::string_view path); + const std::string& Path() const { return cgroup_path_; } + // Creates a child cgroup under this cgroup with the given name. + PosixErrorOr<Cgroup> CreateChild(std::string_view name) const; + std::string Relpath(absl::string_view leaf) const { return JoinPath(cgroup_path_, leaf); } diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc index 483ae848d..253411858 100644 --- a/test/util/fs_util.cc +++ b/test/util/fs_util.cc @@ -201,7 +201,8 @@ PosixError UnlinkAt(const FileDescriptor& dfd, absl::string_view path, PosixError Mkdir(absl::string_view path, int mode) { int res = mkdir(std::string(path).c_str(), mode); if (res < 0) { - return PosixError(errno, absl::StrCat("mkdir ", path, " mode ", mode)); + return PosixError(errno, + absl::StrFormat("mkdir \"%s\" mode %#o", path, mode)); } return NoError(); diff --git a/test/util/fuchsia_capability_util.cc b/test/util/fuchsia_capability_util.cc new file mode 100644 index 000000000..bbe8643e7 --- /dev/null +++ b/test/util/fuchsia_capability_util.cc @@ -0,0 +1,73 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef __Fuchsia__ + +#include <netinet/if_ether.h> +#include <netinet/in.h> +#include <sys/socket.h> + +#include "test/util/socket_util.h" + +namespace gvisor { +namespace testing { + +// On Linux, access to raw IP and packet socket is controlled by a single +// capability (CAP_NET_RAW). However on Fuchsia, access to raw IP and packet +// sockets are controlled by separate capabilities/protocols. + +namespace { + +PosixErrorOr<bool> HaveSocketCapability(int domain, int type, int protocol) { + // Fuchsia does not have a platform supported way to check the protocols made + // available to a sandbox. As a workaround, simply try to create the specified + // socket and assume no access if we get a no permissions error. + auto s = Socket(domain, type, protocol); + if (s.ok()) { + return true; + } + if (s.error().errno_value() == EPERM) { + return false; + } + return s.error(); +} + +} // namespace + +PosixErrorOr<bool> HaveRawIPSocketCapability() { + static PosixErrorOr<bool> result(false); + static std::once_flag once; + + std::call_once(once, [&]() { + result = HaveSocketCapability(AF_INET, SOCK_RAW, IPPROTO_UDP); + }); + + return result; +} + +PosixErrorOr<bool> HavePacketSocketCapability() { + static PosixErrorOr<bool> result(false); + static std::once_flag once; + + std::call_once(once, [&]() { + result = HaveSocketCapability(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + }); + + return result; +} + +} // namespace testing +} // namespace gvisor + +#endif // __Fuchsia__ diff --git a/test/util/capability_util.cc b/test/util/linux_capability_util.cc index a1b994c45..7218aa4ac 100644 --- a/test/util/capability_util.cc +++ b/test/util/linux_capability_util.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "test/util/capability_util.h" +#ifdef __linux__ + +#include "test/util/linux_capability_util.h" #include <linux/capability.h> #include <sched.h> @@ -30,6 +32,14 @@ namespace gvisor { namespace testing { +PosixErrorOr<bool> HaveRawIPSocketCapability() { + return HaveCapability(CAP_NET_RAW); +} + +PosixErrorOr<bool> HavePacketSocketCapability() { + return HaveCapability(CAP_NET_RAW); +} + PosixErrorOr<bool> CanCreateUserNamespace() { // The most reliable way to determine if userns creation is possible is by // trying to create one; see below. @@ -79,3 +89,5 @@ PosixErrorOr<bool> CanCreateUserNamespace() { } // namespace testing } // namespace gvisor + +#endif // __linux__ diff --git a/test/util/linux_capability_util.h b/test/util/linux_capability_util.h new file mode 100644 index 000000000..be94ebd19 --- /dev/null +++ b/test/util/linux_capability_util.h @@ -0,0 +1,128 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Utilities for testing capabilities on Linux. + +#ifndef GVISOR_TEST_UTIL_LINUX_CAPABILITY_UTIL_H_ +#define GVISOR_TEST_UTIL_LINUX_CAPABILITY_UTIL_H_ + +#ifdef __linux__ + +#include <errno.h> +#include <linux/capability.h> +#include <sys/syscall.h> +#include <unistd.h> + +#include "test/util/cleanup.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" +#include "test/util/test_util.h" + +#ifndef _LINUX_CAPABILITY_VERSION_3 +#error Expecting _LINUX_CAPABILITY_VERSION_3 support +#endif + +namespace gvisor { +namespace testing { + +// HaveCapability returns true if the process has the specified EFFECTIVE +// capability. +inline PosixErrorOr<bool> HaveCapability(int cap) { + if (!cap_valid(cap)) { + return PosixError(EINVAL, "Invalid capability"); + } + + struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; + struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); + MaybeSave(); + + return (caps[CAP_TO_INDEX(cap)].effective & CAP_TO_MASK(cap)) != 0; +} + +// SetCapability sets the specified EFFECTIVE capability. +inline PosixError SetCapability(int cap, bool set) { + if (!cap_valid(cap)) { + return PosixError(EINVAL, "Invalid capability"); + } + + struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; + struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); + MaybeSave(); + + if (set) { + caps[CAP_TO_INDEX(cap)].effective |= CAP_TO_MASK(cap); + } else { + caps[CAP_TO_INDEX(cap)].effective &= ~CAP_TO_MASK(cap); + } + header = {_LINUX_CAPABILITY_VERSION_3, 0}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capset, &header, &caps)); + MaybeSave(); + + return NoError(); +} + +// DropPermittedCapability drops the specified PERMITTED. The EFFECTIVE +// capabilities must be a subset of PERMITTED, so those are dropped as well. +inline PosixError DropPermittedCapability(int cap) { + if (!cap_valid(cap)) { + return PosixError(EINVAL, "Invalid capability"); + } + + struct __user_cap_header_struct header = {_LINUX_CAPABILITY_VERSION_3, 0}; + struct __user_cap_data_struct caps[_LINUX_CAPABILITY_U32S_3] = {}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capget, &header, &caps)); + MaybeSave(); + + caps[CAP_TO_INDEX(cap)].effective &= ~CAP_TO_MASK(cap); + caps[CAP_TO_INDEX(cap)].permitted &= ~CAP_TO_MASK(cap); + + header = {_LINUX_CAPABILITY_VERSION_3, 0}; + RETURN_ERROR_IF_SYSCALL_FAIL(syscall(__NR_capset, &header, &caps)); + MaybeSave(); + + return NoError(); +} + +PosixErrorOr<bool> CanCreateUserNamespace(); + +class AutoCapability { + public: + AutoCapability(int cap, bool set) : cap_(cap), set_(set) { + const bool has = EXPECT_NO_ERRNO_AND_VALUE(HaveCapability(cap)); + if (set != has) { + EXPECT_NO_ERRNO(SetCapability(cap_, set_)); + applied_ = true; + } + } + + ~AutoCapability() { + if (applied_) { + EXPECT_NO_ERRNO(SetCapability(cap_, !set_)); + } + } + + private: + int cap_; + bool set_; + bool applied_ = false; +}; + +} // namespace testing +} // namespace gvisor + +#endif // __linux__ + +#endif // GVISOR_TEST_UTIL_LINUX_CAPABILITY_UTIL_H_ diff --git a/test/util/posix_error.h b/test/util/posix_error.h index 9ca09b77c..40853cb21 100644 --- a/test/util/posix_error.h +++ b/test/util/posix_error.h @@ -385,7 +385,7 @@ class PosixErrorIsMatcher { }; // Returns a gMock matcher that matches a PosixError or PosixErrorOr<> whose -// whose error code matches code_matcher, and whose error message matches +// error code matches code_matcher, and whose error message matches // message_matcher. template <typename ErrorCodeMatcher> PosixErrorIsMatcher PosixErrorIs( @@ -395,6 +395,14 @@ PosixErrorIsMatcher PosixErrorIs( std::move(message_matcher)); } +// Returns a gMock matcher that matches a PosixError or PosixErrorOr<> whose +// error code matches code_matcher. +template <typename ErrorCodeMatcher> +PosixErrorIsMatcher PosixErrorIs(ErrorCodeMatcher&& code_matcher) { + return PosixErrorIsMatcher(std::forward<ErrorCodeMatcher>(code_matcher), + ::testing::_); +} + // Returns a gMock matcher that matches a PosixErrorOr<> which is ok() and // value matches the inner matcher. template <typename InnerMatcher> diff --git a/test/syscalls/linux/socket_test_util.cc b/test/util/socket_util.cc index 6039b2a67..650b422ae 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/util/socket_util.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" #include <arpa/inet.h> #include <netinet/in.h> @@ -24,6 +24,7 @@ #include "gtest/gtest.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" #include "absl/time/clock.h" #include "absl/types/optional.h" #include "test/util/file_descriptor.h" @@ -56,7 +57,19 @@ Creator<FileDescriptor> SyscallSocketCreator(int domain, int type, PosixErrorOr<struct sockaddr_un> UniqueUnixAddr(bool abstract, int domain) { struct sockaddr_un addr = {}; + +#ifdef ANDROID + // Using NewTempAbsPath() can cause the tmp directory path to exceed the max + // length (i.e., sizeof(addr.sun_path)). + // + // However, existing systems that are built with the ANDROID configuration + // have their temp directory in a different location, and must respect the + // TEST_TMPDIR. + std::string path = NewTempAbsPath(); +#else std::string path = NewTempAbsPathInDir("/tmp"); +#endif // ANDROID + if (path.size() >= sizeof(addr.sun_path)) { return PosixError(EINVAL, "Unable to generate a temp path of appropriate length"); @@ -85,7 +98,8 @@ Creator<SocketPair> AcceptBindSocketPairCreator(bool abstract, int domain, RETURN_ERROR_IF_SYSCALL_FAIL( bind(bound, AsSockAddr(&bind_addr), sizeof(bind_addr))); MaybeSave(); // Successful bind. - RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5)); + RETURN_ERROR_IF_SYSCALL_FAIL( + listen(bound, /* backlog = */ 5)); // NOLINT(bugprone-argument-comment) MaybeSave(); // Successful listen. int connected; @@ -316,7 +330,8 @@ PosixErrorOr<T> BindIP(int fd, bool dual_stack) { template <typename T> PosixErrorOr<T> TCPBindAndListen(int fd, bool dual_stack) { ASSIGN_OR_RETURN_ERRNO(T addr, BindIP<T>(fd, dual_stack)); - RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd, /* backlog = */ 5)); + RETURN_ERROR_IF_SYSCALL_FAIL( + listen(fd, /* backlog = */ 5)); // NOLINT(bugprone-argument-comment) return addr; } @@ -946,5 +961,169 @@ uint16_t ICMPChecksum(struct icmphdr icmphdr, const char* payload, return csum; } +PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) { + switch (family) { + case AF_INET: + return static_cast<uint16_t>( + reinterpret_cast<sockaddr_in const*>(&addr)->sin_port); + case AF_INET6: + return static_cast<uint16_t>( + reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port); + default: + return PosixError(EINVAL, + absl::StrCat("unknown socket family: ", family)); + } +} + +PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) { + switch (family) { + case AF_INET: + reinterpret_cast<sockaddr_in*>(addr)->sin_port = port; + return NoError(); + case AF_INET6: + reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port; + return NoError(); + default: + return PosixError(EINVAL, + absl::StrCat("unknown socket family: ", family)); + } +} + +void SetupTimeWaitClose(const TestAddress* listener, + const TestAddress* connector, bool reuse, + bool accept_close, sockaddr_storage* listen_addr, + sockaddr_storage* conn_bound_addr) { + // Create the listening socket. + FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener->family(), SOCK_STREAM, IPPROTO_TCP)); + if (reuse) { + ASSERT_THAT(setsockopt(listen_fd.get(), SOL_SOCKET, SO_REUSEADDR, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + } + ASSERT_THAT( + bind(listen_fd.get(), AsSockAddr(listen_addr), listener->addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener->addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), AsSockAddr(listen_addr), &addrlen), + SyscallSucceeds()); + + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener->family(), *listen_addr)); + + // Connect to the listening socket. + FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector->family(), SOCK_STREAM, IPPROTO_TCP)); + + // We disable saves after this point as a S/R causes the netstack seed + // to be regenerated which changes what ports/ISN is picked for a given + // tuple (src ip,src port, dst ip, dst port). This can cause the final + // SYN to use a sequence number that looks like one from the current + // connection in TIME_WAIT and will not be accepted causing the test + // to timeout. + // + // TODO(gvisor.dev/issue/940): S/R portSeed/portHint + DisableSave ds; + + sockaddr_storage conn_addr = connector->addr; + ASSERT_NO_ERRNO(SetAddrPort(connector->family(), &conn_addr, port)); + ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(), AsSockAddr(&conn_addr), + connector->addr_len), + SyscallSucceeds()); + + // Accept the connection. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + + // Get the address/port bound by the connecting socket. + socklen_t conn_addrlen = connector->addr_len; + ASSERT_THAT( + getsockname(conn_fd.get(), AsSockAddr(conn_bound_addr), &conn_addrlen), + SyscallSucceeds()); + + FileDescriptor active_closefd, passive_closefd; + if (accept_close) { + active_closefd = std::move(accepted); + passive_closefd = std::move(conn_fd); + } else { + active_closefd = std::move(conn_fd); + passive_closefd = std::move(accepted); + } + + // shutdown to trigger TIME_WAIT. + ASSERT_THAT(shutdown(active_closefd.get(), SHUT_WR), SyscallSucceeds()); + { + constexpr int kTimeout = 10000; + pollfd pfd = { + .fd = passive_closefd.get(), + .events = POLLIN, + }; + ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + ASSERT_EQ(pfd.revents, POLLIN); + } + ASSERT_THAT(shutdown(passive_closefd.get(), SHUT_WR), SyscallSucceeds()); + { + constexpr int kTimeout = 10000; + constexpr int16_t want_events = POLLHUP; + pollfd pfd = { + .fd = active_closefd.get(), + .events = want_events, + }; + ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + } + + // This sleep is needed to reduce flake to ensure that the passive-close + // ensures the state transitions to CLOSE from LAST_ACK. + absl::SleepFor(absl::Seconds(1)); +} + +constexpr char kRangeFile[] = "/proc/sys/net/ipv4/ip_local_port_range"; + +PosixErrorOr<int> MaybeLimitEphemeralPorts() { + int min = 0; + int max = 1 << 16; + + // Read the ephemeral range from /proc. + ASSIGN_OR_RETURN_ERRNO(std::string rangefile, GetContents(kRangeFile)); + const std::string err_msg = + absl::StrFormat("%s has invalid content: %s", kRangeFile, rangefile); + if (rangefile.back() != '\n') { + return PosixError(EINVAL, err_msg); + } + rangefile.pop_back(); + std::vector<std::string> range = + absl::StrSplit(rangefile, absl::ByAnyChar("\t ")); + if (range.size() < 2 || !absl::SimpleAtoi(range.front(), &min) || + !absl::SimpleAtoi(range.back(), &max)) { + return PosixError(EINVAL, err_msg); + } + + // If we can open as writable, limit the range. + if (!access(kRangeFile, W_OK)) { + ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, + Open(kRangeFile, O_WRONLY | O_TRUNC, 0)); + int newMax = min + 50; + const std::string small_range = absl::StrFormat("%d %d", min, newMax); + int n = write(fd.get(), small_range.c_str(), small_range.size()); + if (n < 0) { + // Hostinet doesn't allow modifying the host port range. And if we're root + // (as we are in some tests), access and open will succeed even if the + // file mode is readonly. + if (errno != EACCES) { + return PosixError( + errno, + absl::StrFormat("write(%d [%s], \"%s\", %d)", fd.get(), kRangeFile, + small_range.c_str(), small_range.size())); + } + } else { + max = newMax; + } + } + return max - min; +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_test_util.h b/test/util/socket_util.h index 76dc090e0..0e2be63cc 100644 --- a/test/syscalls/linux/socket_test_util.h +++ b/test/util/socket_util.h @@ -564,6 +564,22 @@ inline sockaddr* AsSockAddr(sockaddr_un* s) { return reinterpret_cast<sockaddr*>(s); } +PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr); + +PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port); + +// setupTimeWaitClose sets up a socket endpoint in TIME_WAIT state. +// Callers can choose to perform active close on either ends of the connection +// and also specify if they want to enabled SO_REUSEADDR. +void SetupTimeWaitClose(const TestAddress* listener, + const TestAddress* connector, bool reuse, + bool accept_close, sockaddr_storage* listen_addr, + sockaddr_storage* conn_bound_addr); + +// MaybeLimitEphemeralPorts attempts to reduce the number of ephemeral ports and +// returns the number of ephemeral ports. +PosixErrorOr<int> MaybeLimitEphemeralPorts(); + namespace internal { PosixErrorOr<int> TryPortAvailable(int port, AddressFamily family, SocketType type, bool reuse_addr); diff --git a/test/syscalls/linux/socket_test_util_impl.cc b/test/util/socket_util_impl.cc index ef661a0e3..04550ad7c 100644 --- a/test/syscalls/linux/socket_test_util_impl.cc +++ b/test/util/socket_util_impl.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/socket_util.h" namespace gvisor { namespace testing { diff --git a/test/util/test_util_impl.cc b/test/util/test_util_impl.cc index 7e1ad9e66..6b6826898 100644 --- a/test/util/test_util_impl.cc +++ b/test/util/test_util_impl.cc @@ -20,6 +20,7 @@ #include "benchmark/benchmark.h" #include "test/util/logging.h" +extern bool FLAGS_gtest_list_tests; extern bool FLAGS_benchmark_list_tests; extern std::string FLAGS_benchmark_filter; @@ -40,12 +41,18 @@ void TestInit(int* argc, char*** argv) { } int RunAllTests() { - if (FLAGS_benchmark_list_tests || FLAGS_benchmark_filter != ".") { + if (::testing::FLAGS_gtest_list_tests) { + return RUN_ALL_TESTS(); + } + if (FLAGS_benchmark_list_tests) { benchmark::RunSpecifiedBenchmarks(); return 0; - } else { - return RUN_ALL_TESTS(); } + + // Run selected tests & benchmarks. + int rc = RUN_ALL_TESTS(); + benchmark::RunSpecifiedBenchmarks(); + return rc; } } // namespace testing diff --git a/test/util/verity_util.cc b/test/util/verity_util.cc index f1b4c251b..b7d1cb212 100644 --- a/test/util/verity_util.cc +++ b/test/util/verity_util.cc @@ -54,18 +54,21 @@ PosixError FlipRandomBit(int fd, int size) { return NoError(); } -PosixErrorOr<std::string> MountVerity(std::string tmpfs_dir, - std::string filename) { - // Mount a verity fs on the existing tmpfs mount. - std::string mount_opts = "lower_path=" + tmpfs_dir; +PosixErrorOr<std::string> MountVerity(std::string lower_dir, + std::vector<EnableTarget> targets) { + // Mount a verity fs on the existing mount. + std::string mount_opts = "lower_path=" + lower_dir; ASSIGN_OR_RETURN_ERRNO(TempPath verity_dir, TempPath::CreateDir()); RETURN_ERROR_IF_SYSCALL_FAIL( mount("", verity_dir.path().c_str(), "verity", 0, mount_opts.c_str())); - // Enable both the file and the directory. - ASSIGN_OR_RETURN_ERRNO( - auto fd, Open(JoinPath(verity_dir.path(), filename), O_RDONLY, 0777)); - RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(fd.get(), FS_IOC_ENABLE_VERITY)); + for (const EnableTarget& target : targets) { + ASSIGN_OR_RETURN_ERRNO( + auto target_fd, + Open(JoinPath(verity_dir.path(), target.path), target.flags, 0777)); + RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(target_fd.get(), FS_IOC_ENABLE_VERITY)); + } + ASSIGN_OR_RETURN_ERRNO(auto dir_fd, Open(verity_dir.path(), O_RDONLY, 0777)); RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(dir_fd.get(), FS_IOC_ENABLE_VERITY)); @@ -83,6 +86,7 @@ PosixErrorOr<std::string> MountVerity(std::string tmpfs_dir, ASSIGN_OR_RETURN_ERRNO(TempPath verity_with_hash_dir, TempPath::CreateDir()); RETURN_ERROR_IF_SYSCALL_FAIL(mount("", verity_with_hash_dir.path().c_str(), "verity", 0, mount_opts.c_str())); + // Verity directories should not be deleted. Release the TempPath objects to // prevent those directories from being deleted by the destructor. verity_dir.release(); diff --git a/test/util/verity_util.h b/test/util/verity_util.h index 18743ecd6..ebb78b4bb 100644 --- a/test/util/verity_util.h +++ b/test/util/verity_util.h @@ -17,6 +17,8 @@ #include <stdint.h> +#include <vector> + #include "test/util/posix_error.h" namespace gvisor { @@ -44,6 +46,13 @@ struct fsverity_digest { unsigned char digest[]; }; +struct EnableTarget { + std::string path; + int flags; + + EnableTarget(std::string path, int flags) : path(path), flags(flags) {} +}; + constexpr int kMaxDigestSize = 64; constexpr int kDefaultDigestSize = 32; constexpr char kContents[] = "foobarbaz"; @@ -67,7 +76,7 @@ PosixError FlipRandomBit(int fd, int size); // Mount a verity on the tmpfs and enable both the file and the direcotry. Then // mount a new verity with measured root hash. PosixErrorOr<std::string> MountVerity(std::string tmpfs_dir, - std::string filename); + std::vector<EnableTarget> targets); } // namespace testing } // namespace gvisor diff --git a/tools/bazel.mk b/tools/bazel.mk index 60b50cfb0..1444423e4 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -84,7 +84,7 @@ DOCKER_RUN_OPTIONS += -v "$(shell readlink -m $(GCLOUD_CONFIG)):$(GCLOUD_CONFIG) DOCKER_RUN_OPTIONS += -v "/tmp:/tmp" DOCKER_EXEC_OPTIONS := --user $(UID):$(GID) DOCKER_EXEC_OPTIONS += --interactive -ifeq (true,$(shell test -t 0 && echo true)) +ifeq (true,$(shell test -t 1 && echo true)) DOCKER_EXEC_OPTIONS += --tty endif @@ -181,23 +181,13 @@ endif # build_paths extracts the built binary from the bazel stderr output. # -# This could be alternately done by parsing the bazel build event stream, but -# this is a complex schema, and begs the question: what will build the thing -# that parses the output? Bazel? Do we need a separate bootstrapping build -# command here? Yikes, let's just stick with the ugly shell pipeline. -# # The last line is used to prevent terminal shenanigans. build_paths = \ (set -euo pipefail; \ - $(call wrapper,$(BAZEL) build $(BASE_OPTIONS) $(BAZEL_OPTIONS) $(1)) 2>&1 \ - | tee /dev/fd/2 \ - | sed -n -e '/^Target/,$$p' \ - | sed -n -e '/^ \($(subst /,\/,$(subst $(SPACE),\|,$(BUILD_ROOTS)))\)/p' \ - | sed -e 's/ /\n/g' \ - | awk '{$$1=$$1};1' \ - | strings \ - | xargs -r -n 1 -I {} readlink -f "{}" \ - | xargs -r -n 1 -I {} bash -c 'set -xeuo pipefail; $(2)') + $(call wrapper,$(BAZEL) build $(BASE_OPTIONS) $(BAZEL_OPTIONS) $(1)) && \ + $(call wrapper,$(BAZEL) cquery $(BASE_OPTIONS) $(BAZEL_OPTIONS) $(1) --output=starlark --starlark:file=tools/show_paths.bzl) \ + | xargs -r -n 1 -I {} bash -c 'test -e "{}" || exit 0; readlink -f "{}"' \ + | xargs -r -n 1 -I {} bash -c 'set -euo pipefail; $(2)') clean = $(call header,CLEAN) && $(call wrapper,$(BAZEL) clean) build = $(call header,BUILD $(1)) && $(call build_paths,$(1),echo {}) diff --git a/tools/bazeldefs/BUILD b/tools/bazeldefs/BUILD index 24e6f8a94..5295f4a85 100644 --- a/tools/bazeldefs/BUILD +++ b/tools/bazeldefs/BUILD @@ -46,6 +46,11 @@ genrule( outs = ["version.txt"], cmd = "cat bazel-out/stable-status.txt | grep STABLE_VERSION | cut -d' ' -f2- | sed 's/^[^[:digit:]]*//g' >$@", stamp = True, + tags = [ + "manual", + "nobuilder", + "notap", + ], visibility = ["//:sandbox"], ) diff --git a/tools/bazeldefs/cc.bzl b/tools/bazeldefs/cc.bzl index 2831eac5f..57d33726a 100644 --- a/tools/bazeldefs/cc.bzl +++ b/tools/bazeldefs/cc.bzl @@ -9,6 +9,7 @@ cc_test = _cc_test cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain" gtest = "@com_google_googletest//:gtest" gbenchmark = "@com_google_benchmark//:benchmark" +gbenchmark_internal = "@com_google_benchmark//:benchmark" grpcpp = "@com_github_grpc_grpc//:grpc++" vdso_linker_option = "-fuse-ld=gold " diff --git a/tools/bazeldefs/go.bzl b/tools/bazeldefs/go.bzl index da027846b..af3a1c3ee 100644 --- a/tools/bazeldefs/go.bzl +++ b/tools/bazeldefs/go.bzl @@ -6,8 +6,11 @@ load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", load("//tools/bazeldefs:defs.bzl", "select_arch", "select_system") gazelle = _gazelle + go_embed_data = _go_embed_data + go_path = _go_path + bazel_worker_proto = "//tools/bazeldefs:worker_protocol_go_proto" def _go_proto_or_grpc_library(go_library_func, name, **kwargs): @@ -15,10 +18,19 @@ def _go_proto_or_grpc_library(go_library_func, name, **kwargs): # If importpath is explicit, pass straight through. go_library_func(name = name, **kwargs) return - deps = [ - dep.replace("_proto", "_go_proto") - for dep in (kwargs.pop("deps", []) or []) - ] + deps = [] + for d in (kwargs.pop("deps", []) or []): + if d == "@com_google_protobuf//:timestamp_proto": + # Special case: this proto has its Go definitions in a different + # repository. + deps.append("@org_golang_google_protobuf//" + + "types/known/timestamppb") + continue + if "//" in d: + repo, path = d.split("//", 1) + deps.append(repo + "//" + path.replace("_proto", "_go_proto")) + else: + deps.append(d.replace("_proto", "_go_proto")) go_library_func( name = name + "_go_proto", importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name + "_go_proto", @@ -130,18 +142,18 @@ def go_context(ctx, goos = None, goarch = None, std = False): elif goarch != go_ctx.sdk.goarch: fail("Internal GOARCH (%s) doesn't match GoSdk GOARCH (%s)." % (goarch, go_ctx.sdk.goarch)) return struct( - go = go_ctx.go, env = go_ctx.env, - nogo_args = [], - stdlib_srcs = go_ctx.sdk.srcs, - runfiles = depset([go_ctx.go] + go_ctx.sdk.srcs + go_ctx.sdk.tools + go_ctx.stdlib.libs), - goos = go_ctx.sdk.goos, + go = go_ctx.go, goarch = go_ctx.sdk.goarch, + goos = go_ctx.sdk.goos, gotags = go_ctx.tags, + nogo_args = [], + runfiles = depset([go_ctx.go] + go_ctx.sdk.srcs + go_ctx.sdk.tools + go_ctx.stdlib.libs), + stdlib_srcs = go_ctx.sdk.srcs, ) def select_goarch(): - return select_arch(arm64 = "arm64", amd64 = "amd64") + return select_arch(amd64 = "amd64", arm64 = "arm64") def select_goos(): return select_system(linux = "linux") diff --git a/tools/bigquery/bigquery.go b/tools/bigquery/bigquery.go index 935154acc..082410697 100644 --- a/tools/bigquery/bigquery.go +++ b/tools/bigquery/bigquery.go @@ -39,13 +39,94 @@ type Suite struct { Timestamp time.Time `bq:"timestamp"` } +func (s *Suite) String() string { + conditions := make([]string, 0, len(s.Conditions)) + for _, c := range s.Conditions { + conditions = append(conditions, c.String()) + } + benchmarks := make([]string, 0, len(s.Benchmarks)) + for _, b := range s.Benchmarks { + benchmarks = append(benchmarks, b.String()) + } + + format := `Suite: +Name: %s +Conditions: %s +Benchmarks: %s +Official: %t +Timestamp: %s +` + + return fmt.Sprintf(format, + s.Name, + strings.Join(conditions, "\n"), + strings.Join(benchmarks, "\n"), + s.Official, + s.Timestamp) +} + // Benchmark represents an individual benchmark in a suite. type Benchmark struct { Name string `bq:"name"` - Condition []*Condition `bq:"condition"` + Condition []*Condition `bq:"cond"` Metric []*Metric `bq:"metric"` } +// String implements the String method for Benchmark +func (bm *Benchmark) String() string { + conditions := make([]string, 0, len(bm.Condition)) + for _, c := range bm.Condition { + conditions = append(conditions, c.String()) + } + metrics := make([]string, 0, len(bm.Metric)) + for _, m := range bm.Metric { + metrics = append(metrics, m.String()) + } + + format := `Condition: +Name: %s +Conditions: %s +Metrics: %s +` + + return fmt.Sprintf(format, + bm.Name, + strings.Join(conditions, "\n"), + strings.Join(metrics, "\n")) +} + +// AddMetric adds a metric to an existing Benchmark. +func (bm *Benchmark) AddMetric(metricName, unit string, sample float64) { + m := &Metric{ + Name: metricName, + Unit: unit, + Sample: sample, + } + bm.Metric = append(bm.Metric, m) +} + +// AddCondition adds a condition to an existing Benchmark. +func (bm *Benchmark) AddCondition(name, value string) { + bm.Condition = append(bm.Condition, &Condition{ + Name: name, + Value: value, + }) +} + +// NewBenchmark initializes a new benchmark. +func NewBenchmark(name string, iters int) *Benchmark { + return &Benchmark{ + Name: name, + Metric: make([]*Metric, 0), + Condition: []*Condition{ + { + Name: "iterations", + Value: strconv.Itoa(iters), + }, + }, + } +} + // Condition represents qualifiers for the benchmark or suite. For example: // Get_Pid/1/real_time would have Benchmark Name "Get_Pid" with "1" // and "real_time" parameters as conditions. Suite conditions include @@ -55,6 +136,10 @@ type Condition struct { Value string `bq:"value"` } +func (c *Condition) String() string { + return fmt.Sprintf("Condition:\nName: %s Value: %s\n", c.Name, c.Value) +} + // Metric holds the actual metric data and unit information for this benchmark. type Metric struct { Name string `bq:"name"` @@ -62,6 +147,10 @@ type Metric struct { Sample float64 `bq:"sample"` } +func (m *Metric) String() string { + return fmt.Sprintf("Metric:\nName: %s Unit: %s Sample: %e\n", m.Name, m.Unit, m.Sample) +} + // InitBigQuery initializes a BigQuery dataset/table in the project. If the dataset/table already exists, it is not duplicated. func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string, opts []option.ClientOption) error { client, err := bq.NewClient(ctx, projectID, opts...) @@ -87,38 +176,6 @@ func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string, opt return nil } -// AddCondition adds a condition to an existing Benchmark. -func (bm *Benchmark) AddCondition(name, value string) { - bm.Condition = append(bm.Condition, &Condition{ - Name: name, - Value: value, - }) -} - -// AddMetric adds a metric to an existing Benchmark. -func (bm *Benchmark) AddMetric(metricName, unit string, sample float64) { - m := &Metric{ - Name: metricName, - Unit: unit, - Sample: sample, - } - bm.Metric = append(bm.Metric, m) -} - -// NewBenchmark initializes a new benchmark. -func NewBenchmark(name string, iters int) *Benchmark { - return &Benchmark{ - Name: name, - Metric: make([]*Metric, 0), - Condition: []*Condition{ - { - Name: "iterations", - Value: strconv.Itoa(iters), - }, - }, - } -} - // NewBenchmarkWithMetric creates a new sending to BigQuery, initialized with a // single iteration and single metric. func NewBenchmarkWithMetric(name, metric, unit string, value float64) *Benchmark { diff --git a/tools/checkescape/BUILD b/tools/checkescape/BUILD index 940538b9e..109b5410c 100644 --- a/tools/checkescape/BUILD +++ b/tools/checkescape/BUILD @@ -8,6 +8,7 @@ go_library( nogo = False, visibility = ["//tools/nogo:__subpackages__"], deps = [ + "//tools/nogo/objdump", "@org_golang_x_tools//go/analysis:go_default_library", "@org_golang_x_tools//go/analysis/passes/buildssa:go_default_library", "@org_golang_x_tools//go/ssa:go_default_library", diff --git a/tools/checkescape/checkescape.go b/tools/checkescape/checkescape.go index c788654a8..ddd1212d7 100644 --- a/tools/checkescape/checkescape.go +++ b/tools/checkescape/checkescape.go @@ -61,21 +61,19 @@ package checkescape import ( "bufio" "bytes" - "flag" "fmt" "go/ast" "go/token" "go/types" "io" "log" - "os" - "os/exec" "path/filepath" "strings" "golang.org/x/tools/go/analysis" "golang.org/x/tools/go/analysis/passes/buildssa" "golang.org/x/tools/go/ssa" + "gvisor.dev/gvisor/tools/nogo/objdump" ) const ( @@ -92,21 +90,6 @@ const ( exempt = "// escapes" ) -var ( - // Binary is the binary under analysis. - // - // See Reader, below. - binary = flag.String("binary", "", "binary under analysis") - - // Reader is the input stream. - // - // This may be set instead of Binary. - Reader io.Reader - - // objdumpTool is the tool used to dump a binary. - objdumpTool = flag.String("objdump_tool", "", "tool used to dump a binary") -) - // EscapeReason is an escape reason. // // This is a simple enum. @@ -374,31 +357,6 @@ func MergeAll(others []Escapes) (es Escapes) { // Note that the map uses <basename.go>:<line> because that is all that is // provided in the objdump format. Since this is all local, it is sufficient. func loadObjdump() (map[string][]string, error) { - var ( - args []string - stdin io.Reader - ) - if *binary != "" { - args = append(args, *binary) - } else if Reader != nil { - stdin = Reader - } else { - // We have no input stream or binary. - return nil, fmt.Errorf("no binary or reader provided") - } - - // Construct our command. - cmd := exec.Command(*objdumpTool, args...) - cmd.Stdin = stdin - cmd.Stderr = os.Stderr - out, err := cmd.StdoutPipe() - if err != nil { - return nil, err - } - if err := cmd.Start(); err != nil { - return nil, err - } - // Identify calls by address or name. Note that this is also // constructed dynamically below, as we encounted the addresses. // This is because some of the functions (duffzero) may have @@ -431,78 +389,83 @@ func loadObjdump() (map[string][]string, error) { // Build the map. nextFunc := "" // For funcsAllowed. m := make(map[string][]string) - r := bufio.NewReader(out) -NextLine: - for { - line, err := r.ReadString('\n') - if err != nil && err != io.EOF { - return nil, err - } - fields := strings.Fields(line) - - // Is this an "allowed" function definition? - if len(fields) >= 2 && fields[0] == "TEXT" { - nextFunc = strings.TrimSuffix(fields[1], "(SB)") - if _, ok := funcsAllowed[nextFunc]; !ok { - nextFunc = "" // Don't record addresses. - } - } - if nextFunc != "" && len(fields) > 2 { - // Save the given address (in hex form, as it appears). - addrsAllowed[fields[1]] = struct{}{} - } - - // We recognize lines corresponding to actual code (not the - // symbol name or other metadata) and annotate them if they - // correspond to an explicit CALL instruction. We assume that - // the lack of a CALL for a given line is evidence that escape - // analysis has eliminated an allocation. - // - // Lines look like this (including the first space): - // gohacks_unsafe.go:33 0xa39 488b442408 MOVQ 0x8(SP), AX - if len(fields) >= 5 && line[0] == ' ' { - if !strings.Contains(fields[3], "CALL") { - continue + if err := objdump.Load(func(origR io.Reader) error { + r := bufio.NewReader(origR) + NextLine: + for { + line, err := r.ReadString('\n') + if err != nil && err != io.EOF { + return err } - site := fields[0] - target := strings.TrimSuffix(fields[4], "(SB)") + fields := strings.Fields(line) - // Ignore strings containing allowed functions. - if _, ok := funcsAllowed[target]; ok { - continue + // Is this an "allowed" function definition? + if len(fields) >= 2 && fields[0] == "TEXT" { + nextFunc = strings.TrimSuffix(fields[1], "(SB)") + if _, ok := funcsAllowed[nextFunc]; !ok { + nextFunc = "" // Don't record addresses. + } } - if _, ok := addrsAllowed[target]; ok { - continue + if nextFunc != "" && len(fields) > 2 { + // Save the given address (in hex form, as it appears). + addrsAllowed[fields[1]] = struct{}{} } - if len(fields) > 5 { - // This may be a future relocation. Some - // objdump versions describe this differently. - // If it contains any of the functions allowed - // above as a string, we let it go. - softTarget := strings.Join(fields[5:], " ") - for name := range funcsAllowed { - if strings.Contains(softTarget, name) { - continue NextLine + + // We recognize lines corresponding to actual code (not the + // symbol name or other metadata) and annotate them if they + // correspond to an explicit CALL instruction. We assume that + // the lack of a CALL for a given line is evidence that escape + // analysis has eliminated an allocation. + // + // Lines look like this (including the first space): + // gohacks_unsafe.go:33 0xa39 488b442408 MOVQ 0x8(SP), AX + if len(fields) >= 5 && line[0] == ' ' { + if !strings.Contains(fields[3], "CALL") { + continue + } + site := fields[0] + target := strings.TrimSuffix(fields[4], "(SB)") + + // Ignore strings containing allowed functions. + if _, ok := funcsAllowed[target]; ok { + continue + } + if _, ok := addrsAllowed[target]; ok { + continue + } + if len(fields) > 5 { + // This may be a future relocation. Some + // objdump versions describe this differently. + // If it contains any of the functions allowed + // above as a string, we let it go. + softTarget := strings.Join(fields[5:], " ") + for name := range funcsAllowed { + if strings.Contains(softTarget, name) { + continue NextLine + } } } - } - // Does this exist already? - existing, ok := m[site] - if !ok { - existing = make([]string, 0, 1) - } - for _, other := range existing { - if target == other { - continue NextLine + // Does this exist already? + existing, ok := m[site] + if !ok { + existing = make([]string, 0, 1) + } + for _, other := range existing { + if target == other { + continue NextLine + } } + existing = append(existing, target) + m[site] = existing // Update. + } + if err == io.EOF { + break } - existing = append(existing, target) - m[site] = existing // Update. - } - if err == io.EOF { - break } + return nil + }); err != nil { + return nil, err } // Zap any accidental false positives. @@ -518,11 +481,6 @@ NextLine: final[site] = filteredCalls } - // Wait for the dump to finish. - if err := cmd.Wait(); err != nil { - return nil, err - } - return final, nil } diff --git a/tools/checklinkname/BUILD b/tools/checklinkname/BUILD new file mode 100644 index 000000000..0f1b07e24 --- /dev/null +++ b/tools/checklinkname/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "checklinkname", + srcs = [ + "check_linkname.go", + "known.go", + ], + nogo = False, + visibility = ["//tools/nogo:__subpackages__"], + deps = [ + "@org_golang_x_tools//go/analysis:go_default_library", + ], +) diff --git a/tools/checklinkname/README.md b/tools/checklinkname/README.md new file mode 100644 index 000000000..06b3c302d --- /dev/null +++ b/tools/checklinkname/README.md @@ -0,0 +1,54 @@ +# `checklinkname` Analyzer + +`checklinkname` is an analyzer to provide rudimentary type-checking for +`//go:linkname` directives. Since `//go:linkname` only affects linker behavior, +there is no built-in type safety and it is the programmer's responsibility to +ensure the types on either side are compatible. + +`checklinkname` helps with this by checking that uses match expectations, as +defined in this package. + +`known.go` contains the set of known linkname targets. For most functions, we +expect identical types on both sides of the linkname. In a few cases, the types +may be slightly different (e.g., local redefinition of internal type). It is +still the responsibility of the programmer to ensure the signatures in +`known.go` are compatible and safe. + +## Findings + +Here are the most common findings from this package, and how to resolve them. + +### `runtime.foo signature got "BAR" want "BAZ"; stdlib type changed?` + +The definition of `runtime.foo` in the standard library does not match the +expected type in `known.go`. This means that the function signature in the +standard library changed. + +Addressing this will require creating a new linkname directive in a new Go +version build-tagged in any packages using this symbol. Be sure to also check to +ensure use with the new version is safe, as function constraints may have +changed in addition to the signature. + +<!-- TODO(b/165820485): This isn't yet explicitly supported. --> + +`known.go` will also need to be updated to accept the new signature for the new +version of Go. + +### `Cannot find known symbol "runtime.foo"` + +The standard library has removed runtime.foo entirely. Handling is similar to +above, except existing code must transition away from the symbol entirely (note +that is may simply be renamed). + +### `linkname to unknown symbol "mypkg.foo"; add this symbol to checklinkname.knownLinknames type-check against the remote type` + +A package has added a new linkname directive for a symbol not listed in +`known.go`. Address this by adding a new entry for the target symbol. The +`local` field should be the expected type in your package, while `remote` should +be expected type in the remote package (e.g., in the standard library). These +are typically identical, in which case `remote` can be omitted. + +### `usage: //go:linkname localname [linkname]` + +Malformed `//go:linkname` directive. This should be accompanied by a build +failure in the package. diff --git a/tools/checklinkname/check_linkname.go b/tools/checklinkname/check_linkname.go new file mode 100644 index 000000000..5373dd762 --- /dev/null +++ b/tools/checklinkname/check_linkname.go @@ -0,0 +1,229 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package checklinkname ensures that linkname declarations match their source. +package checklinkname + +import ( + "fmt" + "go/ast" + "go/token" + "go/types" + "strings" + + "golang.org/x/tools/go/analysis" +) + +// Analyzer implements the checklinkname analyzer. +var Analyzer = &analysis.Analyzer{ + Name: "checklinkname", + Doc: "verifies that linkname declarations match their source", + Run: run, +} + +// go:linkname can be rather confusing. https://pkg.go.dev/cmd/compile says: +// +// //go:linkname localname [importpath.name] +// +// This special directive does not apply to the Go code that follows it. +// Instead, the //go:linkname directive instructs the compiler to use +// “importpath.name” as the object file symbol name for the variable or +// function declared as “localname” in the source code. If the +// “importpath.name” argument is omitted, the directive uses the symbol's +// default object file symbol name and only has the effect of making the symbol +// accessible to other packages. Because this directive can subvert the type +// system and package modularity, it is only enabled in files that have +// imported "unsafe". +// +// In this package we use the term "local" to refer to the symbol name in the +// same package as the //go:linkname directive, whose name will be changed by +// the linker. We use the term "remote" to refer to the symbol name that we are +// changing to. +// +// In the general case, the local symbol is a function declaration, and the +// remote symbol is a real function in the standard library. + +// linknameSignatures describes a the type signatures of the symbols in a +// //go:linkname directive. +type linknameSignatures struct { + local string + remote string // equivalent to local if "". +} + +func (l *linknameSignatures) Remote() string { + if l.remote == "" { + return l.local + } + return l.remote +} + +// linknameSymbols describes the symbol namess in a single //go:linkname +// directive. +type linknameSymbols struct { + pos token.Pos + local string + remote string +} + +func findLinknames(pass *analysis.Pass, f *ast.File) []linknameSymbols { + var names []linknameSymbols + + for _, cg := range f.Comments { + for _, c := range cg.List { + if len(c.Text) <= 2 || !strings.HasPrefix(c.Text[2:], "go:linkname ") { + continue + } + + f := strings.Fields(c.Text) + if len(f) < 2 || len(f) > 3 { + // Malformed linkname. This is the same error the compiler emits. + pass.Reportf(c.Slash, "usage: //go:linkname localname [linkname]") + } + + if len(f) == 2 { + // "If the “importpath.name” argument is + // omitted, the directive uses the symbol's + // default object file symbol name and only has + // the effect of making the symbol accessible + // to other packages." + // -https://golang.org/cmd/compile + // + // There is no type-checking to be done here. + continue + } + + names = append(names, linknameSymbols{ + pos: c.Slash, + local: f[1], + remote: f[2], + }) + } + } + + return names +} + +func splitSymbol(pkg *types.Package, symbol string) (packagePath, name string) { + // Note that some runtime symbols can have multiple dots. e.g., + // runtime..init_task. + s := strings.SplitN(symbol, ".", 2) + + switch len(s) { + case 1: + // Package name omitted, use current package. + return pkg.Path(), symbol + case 2: + return s[0], s[1] + default: + panic("unreachable") + } +} + +func findObject(pkg *types.Package, symbol string) (types.Object, error) { + packagePath, symbolName := splitSymbol(pkg, symbol) + return findPackageObject(pkg, packagePath, symbolName) +} + +func findPackageObject(pkg *types.Package, packagePath, symbolName string) (types.Object, error) { + if pkg.Path() == packagePath { + o := pkg.Scope().Lookup(symbolName) + if o == nil { + return nil, fmt.Errorf("%q not found in %q (names: %+v)", symbolName, packagePath, pkg.Scope().Names()) + } + return o, nil + } + + for _, p := range pkg.Imports() { + if o, err := findPackageObject(p, packagePath, symbolName); err == nil { + return o, nil + } + } + + return nil, fmt.Errorf("package %q not found", packagePath) +} + +// checkOneLinkname verifies that the type of sym.local matches the type from +// knownLinknames. +func checkOneLinkname(pass *analysis.Pass, f *ast.File, sym linknameSymbols) { + remotePackage, remoteName := splitSymbol(pass.Pkg, sym.remote) + + m, ok := knownLinknames[remotePackage] + if !ok { + pass.Reportf(sym.pos, "linkname to unknown symbol %q; add this symbol to checklinkname.knownLinknames type-check against the remote type", sym.remote) + return + } + + linkname, ok := m[remoteName] + if !ok { + pass.Reportf(sym.pos, "linkname to unknown symbol %q; add this symbol to checklinkname.knownLinknames type-check against the remote type", sym.remote) + return + } + + local, err := findObject(pass.Pkg, sym.local) + if err != nil { + pass.Reportf(sym.pos, "Unable to find symbol %q: %v", sym.local, err) + return + } + + localSig, ok := local.Type().(*types.Signature) + if !ok { + pass.Reportf(local.Pos(), "%q object is not a signature: %+#v", sym.local, local) + return + } + + if linkname.local != localSig.String() { + pass.Reportf(local.Pos(), "%q signature got %q want %q; mismatched types?", sym.local, localSig.String(), linkname.local) + return + } +} + +// checkOneRemote verifies that the type of sym matches wantSig. +func checkOneRemote(pass *analysis.Pass, sym, wantSig string) { + o := pass.Pkg.Scope().Lookup(sym) + if o == nil { + pass.Reportf(pass.Files[0].Package, "Cannot find known symbol %q", sym) + return + } + + sig, ok := o.Type().(*types.Signature) + if !ok { + pass.Reportf(o.Pos(), "%q object is not a signature: %+#v", sym, o) + return + } + + if sig.String() != wantSig { + pass.Reportf(o.Pos(), "%q signature got %q want %q; stdlib type changed?", sym, sig.String(), wantSig) + return + } +} + +func run(pass *analysis.Pass) (interface{}, error) { + // First, check if any remote symbols are in this package. + p, ok := knownLinknames[pass.Pkg.Path()] + if ok { + for sym, l := range p { + checkOneRemote(pass, sym, l.Remote()) + } + } + + // Then check for local //go:linkname directives in this package. + for _, f := range pass.Files { + names := findLinknames(pass, f) + for _, n := range names { + checkOneLinkname(pass, f, n) + } + } + + return nil, nil +} diff --git a/tools/checklinkname/known.go b/tools/checklinkname/known.go new file mode 100644 index 000000000..54e5155fc --- /dev/null +++ b/tools/checklinkname/known.go @@ -0,0 +1,110 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checklinkname + +// knownLinknames is the set of the symbols for which we can do a rudimentary +// type-check on. +// +// When analyzing the remote package (e.g., runtime), we verify the symbol +// signature matches 'remote'. When analyzing local packages with //go:linkname +// directives, we verify the symbol signature matches 'local'. +// +// Usually these are identical, but may differ slightly if equivalent +// replacement types are used in the local packages, such as a copy of a struct +// or uintptr instead of a pointer type. +// +// NOTE: It is the responsibility of the developer to verify the safety of the +// signatures used here! This analyzer only checks that types match this map; +// it does not verify compatibility of the entries themselves. +// +// //go:linkname directives with no corresponding entry here will trigger a +// finding. +// +// We preform only rudimentary string-based type-checking due to limitations in +// the analysis framework. Ideally, from the local package we'd lookup the +// remote symbol's types.Object and perform robust type-checking. +// Unfortunately, remote symbols are typically loaded from the remote package's +// gcexportdata. Since //go:linkname targets are usually not exported symbols, +// they are no included in gcexportdata and we cannot load their types.Object. +// +// TODO(b/165820485): Add option to specific per-version signatures. +var knownLinknames = map[string]map[string]linknameSignatures{ + "runtime": map[string]linknameSignatures{ + "entersyscall": linknameSignatures{ + local: "func()", + }, + "entersyscallblock": linknameSignatures{ + local: "func()", + }, + "exitsyscall": linknameSignatures{ + local: "func()", + }, + "fastrand": linknameSignatures{ + local: "func() uint32", + }, + "gopark": linknameSignatures{ + // TODO(b/165820485): add verification of waitReason + // size and reason and traceEv values. + local: "func(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceEv byte, traceskip int)", + remote: "func(unlockf func(*runtime.g, unsafe.Pointer) bool, lock unsafe.Pointer, reason runtime.waitReason, traceEv byte, traceskip int)", + }, + "goready": linknameSignatures{ + local: "func(gp uintptr, traceskip int)", + remote: "func(gp *runtime.g, traceskip int)", + }, + "goyield": linknameSignatures{ + local: "func()", + }, + "memmove": linknameSignatures{ + local: "func(to unsafe.Pointer, from unsafe.Pointer, n uintptr)", + }, + "throw": linknameSignatures{ + local: "func(s string)", + }, + }, + "sync": map[string]linknameSignatures{ + "runtime_canSpin": linknameSignatures{ + local: "func(i int) bool", + }, + "runtime_doSpin": linknameSignatures{ + local: "func()", + }, + "runtime_Semacquire": linknameSignatures{ + // The only difference here is the parameter names. We + // can't just change our local use to match remote, as + // the stdlib runtime and sync packages also disagree + // on the name, and the analyzer checks that use as + // well. + local: "func(addr *uint32)", + remote: "func(s *uint32)", + }, + "runtime_Semrelease": linknameSignatures{ + // See above. + local: "func(addr *uint32, handoff bool, skipframes int)", + remote: "func(s *uint32, handoff bool, skipframes int)", + }, + }, + "syscall": map[string]linknameSignatures{ + "runtime_BeforeFork": linknameSignatures{ + local: "func()", + }, + "runtime_AfterFork": linknameSignatures{ + local: "func()", + }, + "runtime_AfterForkInChild": linknameSignatures{ + local: "func()", + }, + }, +} diff --git a/tools/checklinkname/test/BUILD b/tools/checklinkname/test/BUILD new file mode 100644 index 000000000..b29bd84f2 --- /dev/null +++ b/tools/checklinkname/test/BUILD @@ -0,0 +1,9 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "test", + testonly = 1, + srcs = ["test_unsafe.go"], +) diff --git a/tools/checklinkname/test/test_unsafe.go b/tools/checklinkname/test/test_unsafe.go new file mode 100644 index 000000000..a7504591c --- /dev/null +++ b/tools/checklinkname/test/test_unsafe.go @@ -0,0 +1,34 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package test provides linkname test targets. +package test + +import ( + _ "unsafe" // for go:linkname. +) + +//go:linkname DetachedLinkname runtime.fastrand + +//go:linkname attachedLinkname runtime.entersyscall +func attachedLinkname() + +// AttachedLinkname reexports attachedLinkname because go vet doesn't like an +// exported go:linkname without a comment starting with "// AttachedLinkname". +func AttachedLinkname() { + attachedLinkname() +} + +// DetachedLinkname has a linkname elsewhere in the file. +func DetachedLinkname() uint32 diff --git a/tools/checklocks/BUILD b/tools/checklocks/BUILD index 7d4c63dc7..d23b7cde6 100644 --- a/tools/checklocks/BUILD +++ b/tools/checklocks/BUILD @@ -4,11 +4,16 @@ package(licenses = ["notice"]) go_library( name = "checklocks", - srcs = ["checklocks.go"], + srcs = [ + "analysis.go", + "annotations.go", + "checklocks.go", + "facts.go", + "state.go", + ], nogo = False, visibility = ["//tools/nogo:__subpackages__"], deps = [ - "//pkg/log", "@org_golang_x_tools//go/analysis:go_default_library", "@org_golang_x_tools//go/analysis/passes/buildssa:go_default_library", "@org_golang_x_tools//go/ssa:go_default_library", diff --git a/tools/checklocks/README.md b/tools/checklocks/README.md index dfb0275ab..bd4beb649 100644 --- a/tools/checklocks/README.md +++ b/tools/checklocks/README.md @@ -1,16 +1,29 @@ # CheckLocks Analyzer -<!--* freshness: { owner: 'gvisor-eng' reviewed: '2020-10-05' } *--> +<!--* freshness: { owner: 'gvisor-eng' reviewed: '2021-03-21' } *--> -Checklocks is a nogo analyzer that at compile time uses Go's static analysis -tools to identify and flag cases where a field that is guarded by a mutex in the -same struct is accessed outside of a mutex lock. +Checklocks is an analyzer for lock and atomic constraints. The analyzer relies +on explicit annotations to identify fields that should be checked for access. -The analyzer relies on explicit '// +checklocks:<mutex-name>' kind of -annotations to identify fields that should be checked for access. +## Atomic annotations -Individual struct members may be protected by annotations that indicate how they -must be accessed. These annotations are of the form: +Individual struct members may be noted as requiring atomic access. These +annotations are of the form: + +```go +type foo struct { + // +checkatomic + bar int32 +} +``` + +This will ensure that all accesses to bar are atomic, with the exception of +operations on newly allocated objects. + +## Lock annotations + +Individual struct members may be protected by annotations that indicate locking +requirements for accessing members. These annotations are of the form: ```go type foo struct { @@ -64,30 +77,6 @@ annotations from becoming stale over time as fields are renamed, etc. # Currently not supported -1. The analyzer does not correctly handle deferred functions. e.g The following - code is not correctly checked by the analyzer. The defer call is never - evaluated. As a result if the lock was to be say unlocked twice via deferred - functions it would not be caught by the analyzer. - - Similarly deferred anonymous functions are not evaluated either. - -```go -type A struct { - mu sync.Mutex - - // +checklocks:mu - x int -} - -func abc() { - var a A - a.mu.Lock() - defer a.mu.Unlock() - defer a.mu.Unlock() - a.x = 1 -} -``` - 1. Anonymous functions are not correctly evaluated. The analyzer does not currently support specifying annotations on anonymous functions as a result evaluation of a function that accesses protected fields will fail. @@ -107,10 +96,9 @@ func abc() { f() a.mu.Unlock() } - ``` -# Explicitly Not Supported +### Explicitly Not Supported 1. Checking for embedded mutexes as sync.Locker rather than directly as 'sync.Mutex'. In other words, the checker will not track mutex Lock and @@ -140,3 +128,30 @@ func abc() { checklocks. Only struct members can be used. 2. The checker will not support checking for lock ordering violations. + +## Mixed mode + +Some members may allow read-only atomic access, but be protected against writes +by a mutex. Generally, this imposes the following requirements: + +For a read, one of the following must be true: + +1. A lock held be held. +1. The access is atomic. + +For a write, both of the following must be true: + +1. The lock must be held. +1. The write must be atomic. + +In order to annotate a relevant field, simply apply *both* annotations from +above. For example: + +```go +type foo struct { + mu sync.Mutex + // +checklocks:mu + // +checkatomic + bar int32 +} +``` diff --git a/tools/checklocks/analysis.go b/tools/checklocks/analysis.go new file mode 100644 index 000000000..d3fd797d0 --- /dev/null +++ b/tools/checklocks/analysis.go @@ -0,0 +1,628 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checklocks + +import ( + "go/token" + "go/types" + "strings" + + "golang.org/x/tools/go/ssa" +) + +func gcd(a, b atomicAlignment) atomicAlignment { + for b != 0 { + a, b = b, a%b + } + return a +} + +// typeAlignment returns the type alignment for the given type. +func (pc *passContext) typeAlignment(pkg *types.Package, obj types.Object) atomicAlignment { + requiredOffset := atomicAlignment(1) + if pc.pass.ImportObjectFact(obj, &requiredOffset) { + return requiredOffset + } + + switch x := obj.Type().Underlying().(type) { + case *types.Struct: + fields := make([]*types.Var, x.NumFields()) + for i := 0; i < x.NumFields(); i++ { + fields[i] = x.Field(i) + } + offsets := pc.pass.TypesSizes.Offsetsof(fields) + for i := 0; i < x.NumFields(); i++ { + // Check the offset, and then assuming that this offset + // aligns with the offset for the broader type. + fieldRequired := pc.typeAlignment(pkg, fields[i]) + if offsets[i]%int64(fieldRequired) != 0 { + // The offset of this field is not compatible. + pc.maybeFail(fields[i].Pos(), "have alignment %d, need %d", offsets[i], fieldRequired) + } + // Ensure the requiredOffset is the LCM of the offset. + requiredOffset *= fieldRequired / gcd(requiredOffset, fieldRequired) + } + case *types.Array: + // Export direct alignment requirements. + if named, ok := x.Elem().(*types.Named); ok { + requiredOffset = pc.typeAlignment(pkg, named.Obj()) + } + default: + // Use the compiler's underlying alignment. + requiredOffset = atomicAlignment(pc.pass.TypesSizes.Alignof(obj.Type().Underlying())) + } + + if pkg == obj.Pkg() { + // Cache as an object fact, to subsequent calls. Note that we + // can only export object facts for the package that we are + // currently analyzing. There may be no exported facts for + // array types or alias types, for example. + pc.pass.ExportObjectFact(obj, &requiredOffset) + } + + return requiredOffset +} + +// checkTypeAlignment checks the alignment of the given type. +// +// This calls typeAlignment, which resolves all types recursively. This method +// should be called for all types individual to ensure full coverage. +func (pc *passContext) checkTypeAlignment(pkg *types.Package, typ *types.Named) { + _ = pc.typeAlignment(pkg, typ.Obj()) +} + +// checkAtomicCall checks for an atomic access. +// +// inst is the instruction analyzed, obj is used only for maybeFail. +// +// If mustBeAtomic is true, then we assert that the instruction *is* an atomic +// fucnction call. If it is false, then we assert that it is *not* an atomic +// dispatch. +// +// If readOnly is true, then only atomic read access are allowed. Note that +// readOnly is only meaningful if mustBeAtomic is set. +func (pc *passContext) checkAtomicCall(inst ssa.Instruction, obj types.Object, mustBeAtomic, readOnly bool) { + switch x := inst.(type) { + case *ssa.Call: + if x.Common().IsInvoke() { + if mustBeAtomic { + // This is an illegal interface dispatch. + pc.maybeFail(inst.Pos(), "dynamic dispatch with atomic-only field") + } + return + } + fn, ok := x.Common().Value.(*ssa.Function) + if !ok { + if mustBeAtomic { + // This is an illegal call to a non-static function. + pc.maybeFail(inst.Pos(), "dispatch to non-static function with atomic-only field") + } + return + } + pkg := fn.Package() + if pkg == nil { + if mustBeAtomic { + // This is a call to some shared wrapper function. + pc.maybeFail(inst.Pos(), "dispatch to shared function or wrapper") + } + return + } + var lff lockFunctionFacts // Check for exemption. + if obj := fn.Object(); obj != nil && pc.pass.ImportObjectFact(obj, &lff) && lff.Ignore { + return + } + if name := pkg.Pkg.Name(); name != "atomic" && name != "atomicbitops" { + if mustBeAtomic { + // This is an illegal call to a non-atomic package function. + pc.maybeFail(inst.Pos(), "dispatch to non-atomic function with atomic-only field") + } + return + } + if !mustBeAtomic { + // We are *not* expecting an atomic dispatch. + if _, ok := pc.forced[pc.positionKey(inst.Pos())]; !ok { + pc.maybeFail(inst.Pos(), "unexpected call to atomic function") + } + } + if !strings.HasPrefix(fn.Name(), "Load") && readOnly { + // We are not allowing any reads in this context. + if _, ok := pc.forced[pc.positionKey(inst.Pos())]; !ok { + pc.maybeFail(inst.Pos(), "unexpected call to atomic write function, is a lock missing?") + } + return + } + default: + if mustBeAtomic { + // This is something else entirely. + if _, ok := pc.forced[pc.positionKey(inst.Pos())]; !ok { + pc.maybeFail(inst.Pos(), "illegal use of atomic-only field by %T instruction", inst) + } + return + } + } +} + +func resolveStruct(typ types.Type) (*types.Struct, bool) { + structType, ok := typ.Underlying().(*types.Struct) + if ok { + return structType, true + } + ptrType, ok := typ.Underlying().(*types.Pointer) + if ok { + return resolveStruct(ptrType.Elem()) + } + return nil, false +} + +func findField(typ types.Type, field int) (types.Object, bool) { + structType, ok := resolveStruct(typ) + if !ok { + return nil, false + } + return structType.Field(field), true +} + +// instructionWithReferrers is a generalization over ssa.Field, ssa.FieldAddr. +type instructionWithReferrers interface { + ssa.Instruction + Referrers() *[]ssa.Instruction +} + +// checkFieldAccess checks the validity of a field access. +// +// This also enforces atomicity constraints for fields that must be accessed +// atomically. The parameter isWrite indicates whether this field is used +// downstream for a write operation. +func (pc *passContext) checkFieldAccess(inst instructionWithReferrers, structObj ssa.Value, field int, ls *lockState, isWrite bool) { + var ( + lff lockFieldFacts + lgf lockGuardFacts + guardsFound int + guardsHeld int + ) + + fieldObj, _ := findField(structObj.Type(), field) + pc.pass.ImportObjectFact(fieldObj, &lff) + pc.pass.ImportObjectFact(fieldObj, &lgf) + + for guardName, fl := range lgf.GuardedBy { + guardsFound++ + r := fl.resolve(structObj) + if _, ok := ls.isHeld(r); ok { + guardsHeld++ + continue + } + if _, ok := pc.forced[pc.positionKey(inst.Pos())]; ok { + // Mark this as locked, since it has been forced. + ls.lockField(r) + guardsHeld++ + continue + } + // Note that we may allow this if the disposition is atomic, + // and we are allowing atomic reads only. This will fall into + // the atomic disposition check below, which asserts that the + // access is atomic. Further, guardsHeld < guardsFound will be + // true for this case, so we require it to be read-only. + if lgf.AtomicDisposition != atomicRequired { + // There is no force key, no atomic access and no lock held. + pc.maybeFail(inst.Pos(), "invalid field access, %s must be locked when accessing %s (locks: %s)", guardName, fieldObj.Name(), ls.String()) + } + } + + // Check the atomic access for this field. + switch lgf.AtomicDisposition { + case atomicRequired: + // Check that this is used safely as an input. + readOnly := guardsHeld < guardsFound + if refs := inst.Referrers(); refs != nil { + for _, otherInst := range *refs { + pc.checkAtomicCall(otherInst, fieldObj, true, readOnly) + } + } + // Check that this is not otherwise written non-atomically, + // even if we do hold all the locks. + if isWrite { + pc.maybeFail(inst.Pos(), "non-atomic write of field %s, writes must still be atomic with locks held (locks: %s)", fieldObj.Name(), ls.String()) + } + case atomicDisallow: + // Check that this is *not* used atomically. + if refs := inst.Referrers(); refs != nil { + for _, otherInst := range *refs { + pc.checkAtomicCall(otherInst, fieldObj, false, false) + } + } + } +} + +func (pc *passContext) checkCall(call callCommon, ls *lockState) { + // See: https://godoc.org/golang.org/x/tools/go/ssa#CallCommon + // + // 1. "call" mode: when Method is nil (!IsInvoke), a CallCommon represents an ordinary + // function call of the value in Value, which may be a *Builtin, a *Function or any + // other value of kind 'func'. + // + // Value may be one of: + // (a) a *Function, indicating a statically dispatched call + // to a package-level function, an anonymous function, or + // a method of a named type. + // + // (b) a *MakeClosure, indicating an immediately applied + // function literal with free variables. + // + // (c) a *Builtin, indicating a statically dispatched call + // to a built-in function. + // + // (d) any other value, indicating a dynamically dispatched + // function call. + switch fn := call.Common().Value.(type) { + case *ssa.Function: + var lff lockFunctionFacts + if fn.Object() != nil { + pc.pass.ImportObjectFact(fn.Object(), &lff) + pc.checkFunctionCall(call, fn, &lff, ls) + } else { + // Anonymous functions have no facts, and cannot be + // annotated. We don't check for violations using the + // function facts, since they cannot exist. Instead, we + // do a fresh analysis using the current lock state. + fnls := ls.fork() + for i, arg := range call.Common().Args { + fnls.store(fn.Params[i], arg) + } + pc.checkFunction(call, fn, &lff, fnls, true /* force */) + } + case *ssa.MakeClosure: + // Note that creating and then invoking closures locally is + // allowed, but analysis of passing closures is done when + // checking individual instructions. + pc.checkClosure(call, fn, ls) + default: + return + } +} + +// postFunctionCallUpdate updates all conditions. +func (pc *passContext) postFunctionCallUpdate(call callCommon, lff *lockFunctionFacts, ls *lockState) { + // Release all locks not still held. + for fieldName, fg := range lff.HeldOnEntry { + if _, ok := lff.HeldOnExit[fieldName]; ok { + continue + } + r := fg.resolveCall(call.Common().Args, call.Value()) + if s, ok := ls.unlockField(r); !ok { + if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok { + pc.maybeFail(call.Pos(), "attempt to release %s (%s), but not held (locks: %s)", fieldName, s, ls.String()) + } + } + } + + // Update all held locks if acquired. + for fieldName, fg := range lff.HeldOnExit { + if _, ok := lff.HeldOnEntry[fieldName]; ok { + continue + } + // Acquire the lock per the annotation. + r := fg.resolveCall(call.Common().Args, call.Value()) + if s, ok := ls.lockField(r); !ok { + if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok { + pc.maybeFail(call.Pos(), "attempt to acquire %s (%s), but already held (locks: %s)", fieldName, s, ls.String()) + } + } + } +} + +// checkFunctionCall checks preconditions for function calls, and tracks the +// lock state by recording relevant calls to sync functions. Note that calls to +// atomic functions are tracked by checkFieldAccess by looking directly at the +// referrers (because ordering doesn't matter there, so we need not scan in +// instruction order). +func (pc *passContext) checkFunctionCall(call callCommon, fn *ssa.Function, lff *lockFunctionFacts, ls *lockState) { + // Check all guards required are held. + for fieldName, fg := range lff.HeldOnEntry { + r := fg.resolveCall(call.Common().Args, call.Value()) + if s, ok := ls.isHeld(r); !ok { + if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok { + pc.maybeFail(call.Pos(), "must hold %s (%s) to call %s, but not held (locks: %s)", fieldName, s, fn.Name(), ls.String()) + } else { + // Force the lock to be acquired. + ls.lockField(r) + } + } + } + + // Update all lock state accordingly. + pc.postFunctionCallUpdate(call, lff, ls) + + // Check if it's a method dispatch for something in the sync package. + // See: https://godoc.org/golang.org/x/tools/go/ssa#Function + if fn.Package() != nil && fn.Package().Pkg.Name() == "sync" && fn.Signature.Recv() != nil { + switch fn.Name() { + case "Lock", "RLock": + if s, ok := ls.lockField(resolvedValue{value: call.Common().Args[0], valid: true}); !ok { + if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok { + // Double locking a mutex that is already locked. + pc.maybeFail(call.Pos(), "%s already locked (locks: %s)", s, ls.String()) + } + } + case "Unlock", "RUnlock": + if s, ok := ls.unlockField(resolvedValue{value: call.Common().Args[0], valid: true}); !ok { + if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok { + // Unlocking something that is already unlocked. + pc.maybeFail(call.Pos(), "%s already unlocked (locks: %s)", s, ls.String()) + } + } + } + } +} + +// checkClosure forks the lock state, and creates a binding for the FreeVars of +// the closure. This allows the analysis to resolve the closure. +func (pc *passContext) checkClosure(call callCommon, fn *ssa.MakeClosure, ls *lockState) { + clls := ls.fork() + clfn := fn.Fn.(*ssa.Function) + for i, fv := range clfn.FreeVars { + clls.store(fv, fn.Bindings[i]) + } + + // Note that this is *not* a call to check function call, which checks + // against the function preconditions. Instead, this does a fresh + // analysis of the function from source code with a different state. + var nolff lockFunctionFacts + pc.checkFunction(call, clfn, &nolff, clls, true /* force */) +} + +// freshAlloc indicates that v has been allocated within the local scope. There +// is no lock checking done on objects that are freshly allocated. +func freshAlloc(v ssa.Value) bool { + switch x := v.(type) { + case *ssa.Alloc: + return true + case *ssa.FieldAddr: + return freshAlloc(x.X) + case *ssa.Field: + return freshAlloc(x.X) + case *ssa.IndexAddr: + return freshAlloc(x.X) + case *ssa.Index: + return freshAlloc(x.X) + case *ssa.Convert: + return freshAlloc(x.X) + case *ssa.ChangeType: + return freshAlloc(x.X) + default: + return false + } +} + +// isWrite indicates that this value is used as the addr field in a store. +// +// Note that this may still be used for a write. The return here is optimistic +// but sufficient for basic analysis. +func isWrite(v ssa.Value) bool { + refs := v.Referrers() + if refs == nil { + return false + } + for _, ref := range *refs { + if s, ok := ref.(*ssa.Store); ok && s.Addr == v { + return true + } + } + return false +} + +// callCommon is an ssa.Value that also implements Common. +type callCommon interface { + Pos() token.Pos + Common() *ssa.CallCommon + Value() *ssa.Call +} + +// checkInstruction checks the legality the single instruction based on the +// current lockState. +func (pc *passContext) checkInstruction(inst ssa.Instruction, ls *lockState) (*ssa.Return, *lockState) { + switch x := inst.(type) { + case *ssa.Store: + // Record that this value is holding this other value. This is + // because at the beginning of each ssa execution, there is a + // series of assignments of parameter values to alloc objects. + // This allows us to trace these back to the original + // parameters as aliases above. + // + // Note that this may overwrite an existing value in the lock + // state, but this is intentional. + ls.store(x.Addr, x.Val) + case *ssa.Field: + if !freshAlloc(x.X) { + pc.checkFieldAccess(x, x.X, x.Field, ls, false) + } + case *ssa.FieldAddr: + if !freshAlloc(x.X) { + pc.checkFieldAccess(x, x.X, x.Field, ls, isWrite(x)) + } + case *ssa.Call: + pc.checkCall(x, ls) + case *ssa.Defer: + ls.pushDefer(x) + case *ssa.RunDefers: + for d := ls.popDefer(); d != nil; d = ls.popDefer() { + pc.checkCall(d, ls) + } + case *ssa.MakeClosure: + refs := x.Referrers() + if refs == nil { + // This is strange, it's not used? Ignore this case, + // since it will probably be optimized away. + return nil, nil + } + hasNonCall := false + for _, ref := range *refs { + switch ref.(type) { + case *ssa.Call, *ssa.Defer: + // Analysis will be done on the call itself + // subsequently, including the lock state at + // the time of the call. + default: + // We need to analyze separately. Per below, + // this means that we'll analyze at closure + // construction time no zero assumptions about + // when it will be called. + hasNonCall = true + } + } + if !hasNonCall { + return nil, nil + } + // Analyze the closure without bindings. This means that we + // assume no lock facts or have any existing lock state. Only + // trivial closures are acceptable in this case. + clfn := x.Fn.(*ssa.Function) + var nolff lockFunctionFacts + pc.checkFunction(nil, clfn, &nolff, nil, false /* force */) + case *ssa.Return: + return x, ls // Valid return state. + } + return nil, nil +} + +// checkBasicBlock traverses the control flow graph starting at a set of given +// block and checks each instruction for allowed operations. +func (pc *passContext) checkBasicBlock(fn *ssa.Function, block *ssa.BasicBlock, lff *lockFunctionFacts, parent *lockState, seen map[*ssa.BasicBlock]*lockState) *lockState { + if oldLS, ok := seen[block]; ok && oldLS.isCompatible(parent) { + return nil + } + + // If the lock state is not compatible, then we need to do the + // recursive analysis to ensure that it is still sane. For example, the + // following is guaranteed to generate incompatible locking states: + // + // if foo { + // mu.Lock() + // } + // other stuff ... + // if foo { + // mu.Unlock() + // } + + var ( + rv *ssa.Return + rls *lockState + ) + + // Analyze this block. + seen[block] = parent + ls := parent.fork() + for _, inst := range block.Instrs { + rv, rls = pc.checkInstruction(inst, ls) + if rls != nil { + failed := false + // Validate held locks. + for fieldName, fg := range lff.HeldOnExit { + r := fg.resolveStatic(fn, rv) + if s, ok := rls.isHeld(r); !ok { + if _, ok := pc.forced[pc.positionKey(rv.Pos())]; !ok { + pc.maybeFail(rv.Pos(), "lock %s (%s) not held (locks: %s)", fieldName, s, rls.String()) + failed = true + } else { + // Force the lock to be acquired. + rls.lockField(r) + } + } + } + // Check for other locks, but only if the above didn't trip. + if !failed && rls.count() != len(lff.HeldOnExit) { + pc.maybeFail(rv.Pos(), "return with unexpected locks held (locks: %s)", rls.String()) + } + } + } + + // Analyze all successors. + for _, succ := range block.Succs { + // Collect possible return values, and make sure that the lock + // state aligns with any return value that we may have found + // above. Note that checkBasicBlock will recursively analyze + // the lock state to ensure that Releases and Acquires are + // respected. + if pls := pc.checkBasicBlock(fn, succ, lff, ls, seen); pls != nil { + if rls != nil && !rls.isCompatible(pls) { + if _, ok := pc.forced[pc.positionKey(fn.Pos())]; !ok { + pc.maybeFail(fn.Pos(), "incompatible return states (first: %s, second: %v)", rls.String(), pls.String()) + } + } + rls = pls + } + } + return rls +} + +// checkFunction checks a function invocation, typically starting with nil lockState. +func (pc *passContext) checkFunction(call callCommon, fn *ssa.Function, lff *lockFunctionFacts, parent *lockState, force bool) { + defer func() { + // Mark this function as checked. This is used by the top-level + // loop to ensure that all anonymous functions are scanned, if + // they are not explicitly invoked here. Note that this can + // happen if the anonymous functions are e.g. passed only as + // parameters or used to initialize some structure. + pc.functions[fn] = struct{}{} + }() + if _, ok := pc.functions[fn]; !force && ok { + // This function has already been analyzed at least once. + // That's all we permit for each function, although this may + // cause some anonymous functions to be analyzed in only one + // context. + return + } + + // If no return value is provided, then synthesize one. This is used + // below only to check against the locks preconditions, which may + // include return values. + if call == nil { + call = &ssa.Call{Call: ssa.CallCommon{Value: fn}} + } + + // Initialize ls with any preconditions that require locks to be held + // for the method to be invoked. Note that in the overwhleming majority + // of cases, parent will be nil. However, in the case of closures and + // anonymous functions, we may start with a non-nil lock state. + ls := parent.fork() + for fieldName, fg := range lff.HeldOnEntry { + // The first is the method object itself so we skip that when looking + // for receiver/function parameters. + r := fg.resolveStatic(fn, call.Value()) + if s, ok := ls.lockField(r); !ok { + // This can only happen if the same value is declared + // multiple times, and should be caught by the earlier + // fact scanning. Keep it here as a sanity check. + pc.maybeFail(fn.Pos(), "lock %s (%s) acquired multiple times (locks: %s)", fieldName, s, ls.String()) + } + } + + // Scan the blocks. + seen := make(map[*ssa.BasicBlock]*lockState) + if len(fn.Blocks) > 0 { + pc.checkBasicBlock(fn, fn.Blocks[0], lff, ls, seen) + } + + // Scan the recover block. + if fn.Recover != nil { + pc.checkBasicBlock(fn, fn.Recover, lff, ls, seen) + } + + // Update all lock state accordingly. This will be called only if we + // are doing inline analysis for e.g. an anonymous function. + if call != nil && parent != nil { + pc.postFunctionCallUpdate(call, lff, parent) + } +} diff --git a/tools/checklocks/annotations.go b/tools/checklocks/annotations.go new file mode 100644 index 000000000..371260980 --- /dev/null +++ b/tools/checklocks/annotations.go @@ -0,0 +1,129 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checklocks + +import ( + "fmt" + + "go/token" + "strconv" + "strings" +) + +const ( + checkLocksAnnotation = "// +checklocks:" + checkLocksAcquires = "// +checklocksacquire:" + checkLocksReleases = "// +checklocksrelease:" + checkLocksIgnore = "// +checklocksignore" + checkLocksForce = "// +checklocksforce" + checkLocksFail = "// +checklocksfail" + checkAtomicAnnotation = "// +checkatomic" +) + +// failData indicates an expected failure. +type failData struct { + pos token.Pos + count int + seen int +} + +// positionKey is a simple position string. +type positionKey string + +// positionKey converts from a token.Pos to a key we can use to track failures +// as the position of the failure annotation is not the same as the position of +// the actual failure (different column/offsets). Hence we ignore these fields +// and only use the file/line numbers to track failures. +func (pc *passContext) positionKey(pos token.Pos) positionKey { + position := pc.pass.Fset.Position(pos) + return positionKey(fmt.Sprintf("%s:%d", position.Filename, position.Line)) +} + +// addFailures adds an expected failure. +func (pc *passContext) addFailures(pos token.Pos, s string) { + count := 1 + if len(s) > 0 && s[0] == ':' { + parsedCount, err := strconv.Atoi(s[1:]) + if err != nil { + pc.pass.Reportf(pos, "unable to parse failure annotation %q: %v", s[1:], err) + return + } + count = parsedCount + } + pc.failures[pc.positionKey(pos)] = &failData{ + pos: pos, + count: count, + } +} + +// addExemption adds an exemption. +func (pc *passContext) addExemption(pos token.Pos) { + pc.exemptions[pc.positionKey(pos)] = struct{}{} +} + +// addForce adds a force annotation. +func (pc *passContext) addForce(pos token.Pos) { + pc.forced[pc.positionKey(pos)] = struct{}{} +} + +// maybeFail checks a potential failure against a specific failure map. +func (pc *passContext) maybeFail(pos token.Pos, fmtStr string, args ...interface{}) { + if fd, ok := pc.failures[pc.positionKey(pos)]; ok { + fd.seen++ + return + } + if _, ok := pc.exemptions[pc.positionKey(pos)]; ok { + return // Ignored, not counted. + } + pc.pass.Reportf(pos, fmtStr, args...) +} + +// checkFailure checks for the expected failure counts. +func (pc *passContext) checkFailures() { + for _, fd := range pc.failures { + if fd.count != fd.seen { + // We are missing expect failures, report as much as possible. + pc.pass.Reportf(fd.pos, "got %d failures, want %d failures", fd.seen, fd.count) + } + } +} + +// extractAnnotations extracts annotations from text. +func (pc *passContext) extractAnnotations(s string, fns map[string]func(p string)) { + for prefix, fn := range fns { + if strings.HasPrefix(s, prefix) { + fn(s[len(prefix):]) + } + } +} + +// extractLineFailures extracts all line-based exceptions. +// +// Note that this applies only to individual line exemptions, and does not +// consider function-wide exemptions, or specific field exemptions, which are +// extracted separately as part of the saved facts for those objects. +func (pc *passContext) extractLineFailures() { + for _, f := range pc.pass.Files { + for _, cg := range f.Comments { + for _, c := range cg.List { + pc.extractAnnotations(c.Text, map[string]func(string){ + checkLocksFail: func(p string) { pc.addFailures(c.Pos(), p) }, + checkLocksIgnore: func(string) { pc.addExemption(c.Pos()) }, + checkLocksForce: func(string) { pc.addForce(c.Pos()) }, + }) + } + } + } +} diff --git a/tools/checklocks/checklocks.go b/tools/checklocks/checklocks.go index 1e877d394..401fb55ec 100644 --- a/tools/checklocks/checklocks.go +++ b/tools/checklocks/checklocks.go @@ -13,32 +13,19 @@ // limitations under the License. // Package checklocks performs lock analysis to identify and flag unprotected -// access to field annotated with a '// +checklocks:<mutex-name>' annotation. +// access to annotated fields. // -// For detailed ussage refer to README.md in the same directory. +// For detailed usage refer to README.md in the same directory. package checklocks import ( - "bytes" - "fmt" "go/ast" "go/token" "go/types" - "reflect" - "regexp" - "strconv" - "strings" "golang.org/x/tools/go/analysis" "golang.org/x/tools/go/analysis/passes/buildssa" "golang.org/x/tools/go/ssa" - "gvisor.dev/gvisor/pkg/log" -) - -const ( - checkLocksAnnotation = "// +checklocks:" - checkLocksIgnore = "// +checklocksignore" - checkLocksFail = "// +checklocksfail" ) // Analyzer is the main entrypoint. @@ -47,712 +34,123 @@ var Analyzer = &analysis.Analyzer{ Doc: "checks lock preconditions on functions and fields", Run: run, Requires: []*analysis.Analyzer{buildssa.Analyzer}, - FactTypes: []analysis.Fact{(*lockFieldFacts)(nil), (*lockFunctionFacts)(nil)}, -} - -// lockFieldFacts apply on every struct field protected by a lock or that is a -// lock. -type lockFieldFacts struct { - // GuardedBy tracks the names and field numbers that guard this field. - GuardedBy map[string]int - - // IsMutex is true if the field is of type sync.Mutex. - IsMutex bool - - // IsRWMutex is true if the field is of type sync.RWMutex. - IsRWMutex bool - - // FieldNumber is the number of this field in the struct. - FieldNumber int -} - -// AFact implements analysis.Fact.AFact. -func (*lockFieldFacts) AFact() {} - -type functionGuard struct { - // ParameterNumber is the index of the object that contains the guarding mutex. - // This is required during SSA analysis as field names and parameters names are - // not available in SSA. For example, from the example below ParameterNumber would - // be 1 and FieldNumber would correspond to the field number of 'mu' within b's type. - // - // //+checklocks:b.mu - // func (a *A) method(b *B, c *C) { - // ... - // } - ParameterNumber int - - // FieldNumber is the field index of the mutex in the parameter's struct - // type. Refer to example above for more details. - FieldNumber int -} - -// lockFunctionFacts apply on every method. -type lockFunctionFacts struct { - // GuardedBy tracks the names and number of parameter (including receiver) - // lockFuncfields that guard calls to this function. - // The key is the name specified in the checklocks annotation. e.g given - // the following code. - // ``` - // type A struct { - // mu sync.Mutex - // a int - // } - // - // // +checklocks:a.mu - // func xyz(a *A) {..} - // ``` - // - // '`+checklocks:a.mu' will result in an entry in this map as shown below. - // GuardedBy: {"a.mu" => {ParameterNumber: 0, FieldNumber: 0} - GuardedBy map[string]functionGuard -} - -// AFact implements analysis.Fact.AFact. -func (*lockFunctionFacts) AFact() {} - -type positionKey string - -// toPositionKey converts from a token.Position to a key we can use to track -// failures as the position of the failure annotation is not the same as the -// position of the actual failure (different column/offsets). Hence we ignore -// these fields and only use the file/line numbers to track failures. -func toPositionKey(position token.Position) positionKey { - return positionKey(fmt.Sprintf("%s:%d", position.Filename, position.Line)) -} - -type failData struct { - pos token.Pos - count int -} - -func (f failData) String() string { - return fmt.Sprintf("pos: %d, count: %d", f.pos, f.count) + FactTypes: []analysis.Fact{(*atomicAlignment)(nil), (*lockFieldFacts)(nil), (*lockGuardFacts)(nil), (*lockFunctionFacts)(nil)}, } +// passContext is a pass with additional expected failures. type passContext struct { - pass *analysis.Pass - - // exemptions tracks functions that should be exempted from lock checking due - // to '// +checklocksignore' annotation. - exemptions map[types.Object]struct{} - - failures map[positionKey]*failData + pass *analysis.Pass + failures map[positionKey]*failData + exemptions map[positionKey]struct{} + forced map[positionKey]struct{} + functions map[*ssa.Function]struct{} } -var ( - mutexRE = regexp.MustCompile("((.*/)|^)sync.(CrossGoroutineMutex|Mutex)") - rwMutexRE = regexp.MustCompile("((.*/)|^)sync.(CrossGoroutineRWMutex|RWMutex)") -) - -func (pc *passContext) extractFieldAnnotations(field *ast.Field, fieldType *types.Var) *lockFieldFacts { - s := fieldType.Type().String() - // We use HasSuffix below because fieldType can be fully qualified with the - // package name eg for the gvisor sync package mutex fields have the type: - // "<package path>/sync/sync.Mutex" - switch { - case mutexRE.Match([]byte(s)): - return &lockFieldFacts{IsMutex: true} - case rwMutexRE.Match([]byte(s)): - return &lockFieldFacts{IsRWMutex: true} - default: - } - if field.Doc == nil { - return nil - } - fieldFacts := &lockFieldFacts{GuardedBy: make(map[string]int)} - for _, l := range field.Doc.List { - if strings.HasPrefix(l.Text, checkLocksAnnotation) { - guardName := strings.TrimPrefix(l.Text, checkLocksAnnotation) - if _, ok := fieldFacts.GuardedBy[guardName]; ok { - pc.pass.Reportf(field.Pos(), "annotation %s specified more than once.", l.Text) - continue - } - fieldFacts.GuardedBy[guardName] = -1 - } - } - - return fieldFacts -} - -func (pc *passContext) findField(v ssa.Value, fieldNumber int) types.Object { - structType, ok := v.Type().Underlying().(*types.Struct) - if !ok { - structType = v.Type().Underlying().(*types.Pointer).Elem().Underlying().(*types.Struct) - } - return structType.Field(fieldNumber) -} - -// findAndExportStructFacts finds any struct fields that are annotated with the -// "// +checklocks:" annotation and exports relevant facts about the fields to -// be used in later analysis. -func (pc *passContext) findAndExportStructFacts(ss *ast.StructType, structType *types.Struct) { - type fieldRef struct { - fieldObj *types.Var - facts *lockFieldFacts - } - mutexes := make(map[string]*fieldRef) - rwMutexes := make(map[string]*fieldRef) - guardedFields := make(map[string]*fieldRef) - for i, field := range ss.Fields.List { - fieldObj := structType.Field(i) - fieldFacts := pc.extractFieldAnnotations(field, fieldObj) - if fieldFacts == nil { - continue - } - fieldFacts.FieldNumber = i - - ref := &fieldRef{fieldObj, fieldFacts} - if fieldFacts.IsMutex { - mutexes[fieldObj.Name()] = ref - } - if fieldFacts.IsRWMutex { - rwMutexes[fieldObj.Name()] = ref - } - if len(fieldFacts.GuardedBy) != 0 { - guardedFields[fieldObj.Name()] = ref - } - } - - // Export facts about all mutexes. - for _, f := range mutexes { - pc.pass.ExportObjectFact(f.fieldObj, f.facts) - } - // Export facts about all rwMutexes. - for _, f := range rwMutexes { - pc.pass.ExportObjectFact(f.fieldObj, f.facts) - } - - // Validate that guarded fields annotations refer to actual mutexes or - // rwMutexes in the struct. - for _, gf := range guardedFields { - for g := range gf.facts.GuardedBy { - if f, ok := mutexes[g]; ok { - gf.facts.GuardedBy[g] = f.facts.FieldNumber - } else if f, ok := rwMutexes[g]; ok { - gf.facts.GuardedBy[g] = f.facts.FieldNumber - } else { - pc.maybeFail(gf.fieldObj.Pos(), false /* isExempted */, "invalid mutex guard, no such mutex %s in struct %s", g, structType.String()) - continue - } - // Export guarded field fact. - pc.pass.ExportObjectFact(gf.fieldObj, gf.facts) - } - } -} - -func (pc *passContext) findAndExportFuncFacts(d *ast.FuncDecl) { - log.Debugf("finding and exporting function facts\n") - // for each function definition, check for +checklocks:mu annotation, which - // means that the function must be called with that lock held. - fnObj := pc.pass.TypesInfo.ObjectOf(d.Name) - funcFacts := lockFunctionFacts{GuardedBy: make(map[string]functionGuard)} - var ( - ignore bool - ignorePos token.Pos - ) - -outerLoop: - for _, l := range d.Doc.List { - if strings.HasPrefix(l.Text, checkLocksIgnore) { - pc.exemptions[fnObj] = struct{}{} - ignore = true - ignorePos = l.Pos() - continue - } - if strings.HasPrefix(l.Text, checkLocksAnnotation) { - guardName := strings.TrimPrefix(l.Text, checkLocksAnnotation) - if _, ok := funcFacts.GuardedBy[guardName]; ok { - pc.pass.Reportf(l.Pos(), "annotation %s specified more than once.", l.Text) - continue - } - - found := false - x := strings.Split(guardName, ".") - if len(x) != 2 { - pc.pass.Reportf(l.Pos(), "checklocks mutex annotation should be of the form 'a.b'") +// forAllTypes applies the given function over all types. +func (pc *passContext) forAllTypes(fn func(ts *ast.TypeSpec)) { + for _, f := range pc.pass.Files { + for _, decl := range f.Decls { + d, ok := decl.(*ast.GenDecl) + if !ok || d.Tok != token.TYPE { continue } - paramName, fieldName := x[0], x[1] - log.Debugf("paramName: %s, fieldName: %s", paramName, fieldName) - var paramList []*ast.Field - if d.Recv != nil { - paramList = append(paramList, d.Recv.List...) - } - if d.Type.Params != nil { - paramList = append(paramList, d.Type.Params.List...) - } - for paramNum, field := range paramList { - log.Debugf("field names: %+v", field.Names) - if len(field.Names) == 0 { - log.Debugf("skipping because parameter is unnamed", paramName) - continue - } - nameExists := false - for _, name := range field.Names { - if name.Name == paramName { - nameExists = true - } - } - if !nameExists { - log.Debugf("skipping because parameter name(s) does not match : %s", paramName) - continue - } - ptrType, ok := pc.pass.TypesInfo.TypeOf(field.Type).Underlying().(*types.Pointer) - if !ok { - // Since mutexes cannot be copied we only care about parameters that - // are pointer types when checking for guards. - pc.pass.Reportf(l.Pos(), "annotation %s incorrectly specified, parameter name does not refer to a pointer type", l.Text) - continue outerLoop - } - - structType, ok := ptrType.Elem().Underlying().(*types.Struct) - if !ok { - pc.pass.Reportf(l.Pos(), "annotation %s incorrectly specified, parameter name does not refer to a pointer to a struct", l.Text) - continue outerLoop - } - - for i := 0; i < structType.NumFields(); i++ { - if structType.Field(i).Name() == fieldName { - var fieldFacts lockFieldFacts - pc.pass.ImportObjectFact(structType.Field(i), &fieldFacts) - if !fieldFacts.IsMutex && !fieldFacts.IsRWMutex { - pc.pass.Reportf(l.Pos(), "field %s of param %s is not a mutex or an rwmutex", paramName, structType.Field(i)) - continue outerLoop - } - funcFacts.GuardedBy[guardName] = functionGuard{ParameterNumber: paramNum, FieldNumber: i} - found = true - continue outerLoop - } - } - if !found { - pc.pass.Reportf(l.Pos(), "annotation refers to a non-existent field %s in %s", guardName, structType) - continue outerLoop - } - } - if !found { - pc.pass.Reportf(l.Pos(), "annotation refers to a non-existent parameter %s", paramName) - } - } - } - - if len(funcFacts.GuardedBy) == 0 { - return - } - if ignore { - pc.pass.Reportf(ignorePos, "//+checklocksignore cannot be specified with other annotations on the function") - } - funcObj, ok := pc.pass.TypesInfo.Defs[d.Name].(*types.Func) - if !ok { - panic(fmt.Sprintf("function type information missing for %+v", d)) - } - log.Debugf("export fact for d: %+v, funcObj: %+v, funcFacts: %+v\n", d, funcObj, funcFacts) - pc.pass.ExportObjectFact(funcObj, &funcFacts) -} - -type mutexState struct { - // lockedMutexes is used to track which mutexes in a given struct are - // currently locked using the field number of the mutex as the key. - lockedMutexes map[int]struct{} -} - -// locksHeld tracks all currently held locks. -type locksHeld struct { - locks map[ssa.Value]mutexState -} - -// Same returns true if the locks held by other and l are the same. -func (l *locksHeld) Same(other *locksHeld) bool { - return reflect.DeepEqual(l.locks, other.locks) -} - -// Copy creates a copy of all the lock state held by l. -func (l *locksHeld) Copy() *locksHeld { - out := &locksHeld{locks: make(map[ssa.Value]mutexState)} - for ssaVal, mState := range l.locks { - newLM := make(map[int]struct{}) - for k, v := range mState.lockedMutexes { - newLM[k] = v - } - out.locks[ssaVal] = mutexState{lockedMutexes: newLM} - } - return out -} - -func isAlias(first, second ssa.Value) bool { - if first == second { - return true - } - switch x := first.(type) { - case *ssa.Field: - if y, ok := second.(*ssa.Field); ok { - return x.Field == y.Field && isAlias(x.X, y.X) - } - case *ssa.FieldAddr: - if y, ok := second.(*ssa.FieldAddr); ok { - return x.Field == y.Field && isAlias(x.X, y.X) - } - case *ssa.Index: - if y, ok := second.(*ssa.Index); ok { - return isAlias(x.Index, y.Index) && isAlias(x.X, y.X) - } - case *ssa.IndexAddr: - if y, ok := second.(*ssa.IndexAddr); ok { - return isAlias(x.Index, y.Index) && isAlias(x.X, y.X) - } - case *ssa.UnOp: - if y, ok := second.(*ssa.UnOp); ok { - return isAlias(x.X, y.X) - } - } - return false -} - -// checkBasicBlocks traverses the control flow graph starting at a set of given -// block and checks each instruction for allowed operations. -// -// funcFact are the exported facts for the enclosing function for these basic -// blocks. -func (pc *passContext) checkBasicBlocks(blocks []*ssa.BasicBlock, recoverBlock *ssa.BasicBlock, fn *ssa.Function, funcFact lockFunctionFacts) { - if len(blocks) == 0 { - return - } - - // mutexes is used to track currently locked sync.Mutexes/sync.RWMutexes for a - // given *struct identified by ssa.Value. - seen := make(map[*ssa.BasicBlock]*locksHeld) - var scan func(block *ssa.BasicBlock, parent *locksHeld) - scan = func(block *ssa.BasicBlock, parent *locksHeld) { - _, isExempted := pc.exemptions[block.Parent().Object()] - if oldLocksHeld, ok := seen[block]; ok { - if oldLocksHeld.Same(parent) { - return - } - pc.maybeFail(block.Instrs[0].Pos(), isExempted, "failure entering a block %+v with different sets of lock held, oldLocks: %+v, parentLocks: %+v", block, oldLocksHeld, parent) - return - } - seen[block] = parent - var lh = parent.Copy() - for _, inst := range block.Instrs { - pc.checkInstruction(inst, isExempted, lh) - } - for _, b := range block.Succs { - scan(b, lh) - } - } - - // Initialize lh with any preconditions that require locks to be held for the - // method to be invoked. - lh := &locksHeld{locks: make(map[ssa.Value]mutexState)} - for _, fg := range funcFact.GuardedBy { - // The first is the method object itself so we skip that when looking - // for receiver/function parameters. - log.Debugf("fn: %s, fn.Operands() == %+v", fn, fn.Operands(nil)) - r := fn.Params[fg.ParameterNumber] - guardObj := findField(r, fg.FieldNumber) - var fieldFacts lockFieldFacts - pc.pass.ImportObjectFact(guardObj, &fieldFacts) - if fieldFacts.IsMutex || fieldFacts.IsRWMutex { - m, ok := lh.locks[r] - if !ok { - m = mutexState{lockedMutexes: make(map[int]struct{})} - lh.locks[r] = m + for _, gs := range d.Specs { + fn(gs.(*ast.TypeSpec)) } - m.lockedMutexes[fieldFacts.FieldNumber] = struct{}{} - } else { - panic(fmt.Sprintf("function: %+v has an invalid guard that is not a mutex: %+v", fn, guardObj)) - } - } - - // Start scanning from the first basic block. - scan(blocks[0], lh) - - // Validate that all blocks were touched. - for _, b := range blocks { - if _, ok := seen[b]; !ok && b != recoverBlock { - panic(fmt.Sprintf("block %+v was not visited during checkBasicBlocks", b)) - } - } -} - -func (pc *passContext) checkInstruction(inst ssa.Instruction, isExempted bool, lh *locksHeld) { - log.Debugf("checking instruction: %s, isExempted: %t", inst, isExempted) - switch x := inst.(type) { - case *ssa.Field: - pc.checkFieldAccess(inst, x.X, x.Field, isExempted, lh) - case *ssa.FieldAddr: - pc.checkFieldAccess(inst, x.X, x.Field, isExempted, lh) - case *ssa.Call: - pc.checkFunctionCall(x, isExempted, lh) - } -} - -func findField(v ssa.Value, field int) types.Object { - structType, ok := v.Type().Underlying().(*types.Struct) - if !ok { - ptrType, ok := v.Type().Underlying().(*types.Pointer) - if !ok { - return nil - } - structType = ptrType.Elem().Underlying().(*types.Struct) - } - return structType.Field(field) -} - -func (pc *passContext) maybeFail(pos token.Pos, isExempted bool, fmtStr string, args ...interface{}) { - posKey := toPositionKey(pc.pass.Fset.Position(pos)) - log.Debugf("maybeFail: pos: %d, positionKey: %s", pos, posKey) - if fData, ok := pc.failures[posKey]; ok { - fData.count-- - if fData.count == 0 { - delete(pc.failures, posKey) } - return - } - if !isExempted { - pc.pass.Reportf(pos, fmt.Sprintf(fmtStr, args...)) } } -func (pc *passContext) checkFieldAccess(inst ssa.Instruction, structObj ssa.Value, field int, isExempted bool, lh *locksHeld) { - var fieldFacts lockFieldFacts - fieldObj := findField(structObj, field) - pc.pass.ImportObjectFact(fieldObj, &fieldFacts) - log.Debugf("fieldObj: %s, fieldFacts: %+v", fieldObj, fieldFacts) - for _, guardFieldNumber := range fieldFacts.GuardedBy { - guardObj := findField(structObj, guardFieldNumber) - var guardfieldFacts lockFieldFacts - pc.pass.ImportObjectFact(guardObj, &guardfieldFacts) - log.Debugf("guardObj: %s, guardFieldFacts: %+v", guardObj, guardfieldFacts) - if guardfieldFacts.IsMutex || guardfieldFacts.IsRWMutex { - log.Debugf("guard is a mutex") - m, ok := lh.locks[structObj] +// forAllFunctions applies the given function over all functions. +func (pc *passContext) forAllFunctions(fn func(fn *ast.FuncDecl)) { + for _, f := range pc.pass.Files { + for _, decl := range f.Decls { + d, ok := decl.(*ast.FuncDecl) if !ok { - pc.maybeFail(inst.Pos(), isExempted, "invalid field access, %s must be locked when accessing %s", guardObj.Name(), fieldObj.Name()) - continue - } - if _, ok := m.lockedMutexes[guardfieldFacts.FieldNumber]; !ok { - pc.maybeFail(inst.Pos(), isExempted, "invalid field access, %s must be locked when accessing %s", guardObj.Name(), fieldObj.Name()) - } - } else { - panic("incorrect guard that is not a mutex or an RWMutex") - } - } -} - -func (pc *passContext) checkFunctionCall(call *ssa.Call, isExempted bool, lh *locksHeld) { - // See: https://godoc.org/golang.org/x/tools/go/ssa#CallCommon - // - // 1. "call" mode: when Method is nil (!IsInvoke), a CallCommon represents an ordinary - // function call of the value in Value, which may be a *Builtin, a *Function or any - // other value of kind 'func'. - // - // Value may be one of: - // (a) a *Function, indicating a statically dispatched call - // to a package-level function, an anonymous function, or - // a method of a named type. - // - // (b) a *MakeClosure, indicating an immediately applied - // function literal with free variables. - // - // (c) a *Builtin, indicating a statically dispatched call - // to a built-in function. - // - // (d) any other value, indicating a dynamically dispatched - // function call. - fn, ok := call.Common().Value.(*ssa.Function) - if !ok { - return - } - if fn.Object() == nil { - return - } - - // Check if the function should be called with any locks held. - var funcFact lockFunctionFacts - pc.pass.ImportObjectFact(fn.Object(), &funcFact) - if len(funcFact.GuardedBy) > 0 { - for _, fg := range funcFact.GuardedBy { - // The first is the method object itself so we skip that when looking - // for receiver/function parameters. - r := (*call.Value().Operands(nil)[fg.ParameterNumber+1]) - guardObj := findField(r, fg.FieldNumber) - if guardObj == nil { continue } - var fieldFacts lockFieldFacts - pc.pass.ImportObjectFact(guardObj, &fieldFacts) - if fieldFacts.IsMutex || fieldFacts.IsRWMutex { - heldMutexes, ok := lh.locks[r] - if !ok { - log.Debugf("fn: %s, funcFact: %+v", fn, funcFact) - pc.maybeFail(call.Pos(), isExempted, "invalid function call %s must be held", guardObj.Name()) - continue - } - if _, ok := heldMutexes.lockedMutexes[fg.FieldNumber]; !ok { - log.Debugf("fn: %s, funcFact: %+v", fn, funcFact) - pc.maybeFail(call.Pos(), isExempted, "invalid function call %s must be held", guardObj.Name()) - } - } else { - panic(fmt.Sprintf("function: %+v has an invalid guard that is not a mutex: %+v", fn, guardObj)) - } - } - } - - // Check if it's a method dispatch for something in the sync package. - // See: https://godoc.org/golang.org/x/tools/go/ssa#Function - if fn.Package() != nil && fn.Package().Pkg.Name() == "sync" && fn.Signature.Recv() != nil { - r, ok := call.Common().Args[0].(*ssa.FieldAddr) - if !ok { - return - } - guardObj := findField(r.X, r.Field) - var fieldFacts lockFieldFacts - pc.pass.ImportObjectFact(guardObj, &fieldFacts) - if fieldFacts.IsMutex || fieldFacts.IsRWMutex { - switch fn.Name() { - case "Lock", "RLock": - obj := r.X - m := mutexState{lockedMutexes: make(map[int]struct{})} - for k, v := range lh.locks { - if isAlias(r.X, k) { - obj = k - m = v - } - } - if _, ok := m.lockedMutexes[r.Field]; ok { - // Double locking a mutex that is already locked. - pc.maybeFail(call.Pos(), isExempted, "trying to a lock %s when already locked", guardObj.Name()) - return - } - m.lockedMutexes[r.Field] = struct{}{} - lh.locks[obj] = m - case "Unlock", "RUnlock": - // Find the associated locker object. - var ( - obj ssa.Value - m mutexState - ) - for k, v := range lh.locks { - if isAlias(r.X, k) { - obj = k - m = v - break - } - } - if _, ok := m.lockedMutexes[r.Field]; !ok { - pc.maybeFail(call.Pos(), isExempted, "trying to unlock a mutex %s that is already unlocked", guardObj.Name()) - return - } - delete(m.lockedMutexes, r.Field) - if len(m.lockedMutexes) == 0 { - delete(lh.locks, obj) - } - case "RLocker", "DowngradeLock", "TryLock", "TryRLock": - // we explicitly ignore this for now. - default: - panic(fmt.Sprintf("unexpected mutex/rwmutex method invoked: %s", fn.Name())) - } + fn(d) } } } +// run is the main entrypoint. func run(pass *analysis.Pass) (interface{}, error) { pc := &passContext{ pass: pass, - exemptions: make(map[types.Object]struct{}), failures: make(map[positionKey]*failData), + exemptions: make(map[positionKey]struct{}), + forced: make(map[positionKey]struct{}), + functions: make(map[*ssa.Function]struct{}), } // Find all line failure annotations. - for _, f := range pass.Files { - for _, cg := range f.Comments { - for _, c := range cg.List { - if strings.Contains(c.Text, checkLocksFail) { - cnt := 1 - if strings.Contains(c.Text, checkLocksFail+":") { - parts := strings.SplitAfter(c.Text, checkLocksFail+":") - parsedCount, err := strconv.Atoi(parts[1]) - if err != nil { - pc.pass.Reportf(c.Pos(), "invalid checklocks annotation : %s", err) - continue - } - cnt = parsedCount - } - position := toPositionKey(pass.Fset.Position(c.Pos())) - pc.failures[position] = &failData{pos: c.Pos(), count: cnt} - } - } - } - } - - // Find all struct declarations and export any relevant facts. - for _, f := range pass.Files { - for _, decl := range f.Decls { - d, ok := decl.(*ast.GenDecl) - // A GenDecl node (generic declaration node) represents an import, - // constant, type or variable declaration. We only care about struct - // declarations so skip any declaration that doesn't declare a new type. - if !ok || d.Tok != token.TYPE { - continue - } + pc.extractLineFailures() - for _, gs := range d.Specs { - ts := gs.(*ast.TypeSpec) - ss, ok := ts.Type.(*ast.StructType) - if !ok { - continue - } - structType := pass.TypesInfo.TypeOf(ts.Name).Underlying().(*types.Struct) - pc.findAndExportStructFacts(ss, structType) - } + // Find all struct declarations and export relevant facts. + pc.forAllTypes(func(ts *ast.TypeSpec) { + if ss, ok := ts.Type.(*ast.StructType); ok { + structType := pc.pass.TypesInfo.TypeOf(ts.Name).Underlying().(*types.Struct) + pc.exportLockFieldFacts(structType, ss) } - } + }) + pc.forAllTypes(func(ts *ast.TypeSpec) { + if ss, ok := ts.Type.(*ast.StructType); ok { + structType := pc.pass.TypesInfo.TypeOf(ts.Name).Underlying().(*types.Struct) + pc.exportLockGuardFacts(structType, ss) + } + }) - // Find all method calls and export any relevant facts. - for _, f := range pass.Files { - for _, decl := range f.Decls { - d, ok := decl.(*ast.FuncDecl) - // Ignore any non function declarations and any functions that do not have - // any comments. - if !ok || d.Doc == nil { - continue - } - pc.findAndExportFuncFacts(d) + // Check all alignments. + pc.forAllTypes(func(ts *ast.TypeSpec) { + typ, ok := pass.TypesInfo.TypeOf(ts.Name).(*types.Named) + if !ok { + return } - } + pc.checkTypeAlignment(pass.Pkg, typ) + }) - // log all known facts and all failures if debug logging is enabled. - allFacts := pass.AllObjectFacts() - for i := range allFacts { - log.Debugf("fact.object: %+v, fact.Fact: %+v", allFacts[i].Object, allFacts[i].Fact) - } - log.Debugf("all expected failures: %+v", pc.failures) + // Find all function declarations and export relevant facts. + pc.forAllFunctions(func(fn *ast.FuncDecl) { + pc.exportFunctionFacts(fn) + }) // Scan all code looking for invalid accesses. state := pass.ResultOf[buildssa.Analyzer].(*buildssa.SSA) for _, fn := range state.SrcFuncs { - var funcFact lockFunctionFacts - // Anonymous(closures) functions do not have an object() but do show up in - // the SSA. - if obj := fn.Object(); obj != nil { - pc.pass.ImportObjectFact(fn.Object(), &funcFact) + // Import function facts generated above. + // + // Note that anonymous(closures) functions do not have an + // object but do show up in the SSA. They can only be invoked + // by named functions in the package, and they are analyzing + // inline on every call. Thus we skip the analysis here. They + // will be hit on calls, or picked up in the pass below. + if obj := fn.Object(); obj == nil { + continue } + var lff lockFunctionFacts + pc.pass.ImportObjectFact(fn.Object(), &lff) - log.Debugf("checking function: %s", fn) - var b bytes.Buffer - ssa.WriteFunction(&b, fn) - log.Debugf("function SSA: %s", b.String()) - if fn.Recover != nil { - pc.checkBasicBlocks([]*ssa.BasicBlock{fn.Recover}, nil, fn, funcFact) + // Do we ignore this? + if lff.Ignore { + continue } - pc.checkBasicBlocks(fn.Blocks, fn.Recover, fn, funcFact) - } - // Scan for remaining failures we expect. - for _, failure := range pc.failures { - // We are missing expect failures, report as much as possible. - pass.Reportf(failure.pos, "expected %d failures", failure.count) + // Check the basic blocks in the function. + pc.checkFunction(nil, fn, &lff, nil, false /* force */) } + for _, fn := range state.SrcFuncs { + // Ensure all anonymous functions are hit. They are not + // permitted to have any lock preconditions. + if obj := fn.Object(); obj != nil { + continue + } + var nolff lockFunctionFacts + pc.checkFunction(nil, fn, &nolff, nil, false /* force */) + } + + // Check for expected failures. + pc.checkFailures() return nil, nil } diff --git a/tools/checklocks/facts.go b/tools/checklocks/facts.go new file mode 100644 index 000000000..34c9f5ef1 --- /dev/null +++ b/tools/checklocks/facts.go @@ -0,0 +1,624 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checklocks + +import ( + "fmt" + "go/ast" + "go/token" + "go/types" + "regexp" + "strings" + + "golang.org/x/tools/go/ssa" +) + +// atomicAlignment is saved per type. +// +// This represents the alignment required for the type, which may +// be implied and imposed by other types within the aggregate type. +type atomicAlignment int + +// AFact implements analysis.Fact.AFact. +func (*atomicAlignment) AFact() {} + +// atomicDisposition is saved per field. +// +// This represents how the field must be accessed. It must either +// be non-atomic (default), atomic or ignored. +type atomicDisposition int + +const ( + atomicDisallow atomicDisposition = iota + atomicIgnore + atomicRequired +) + +// fieldList is a simple list of fields, used in two types below. +// +// Note that the integers in this list refer to one of two things: +// - A positive integer refers to a field index in a struct. +// - A negative integer refers to a field index in a struct, where +// that field is a pointer and must be subsequently resolved. +type fieldList []int + +// resolvedValue is an ssa.Value with additional fields. +// +// This can be resolved to a string as part of a lock state. +type resolvedValue struct { + value ssa.Value + valid bool + fieldList []int +} + +// findExtract finds a relevant extract. This must exist within the referrers +// to the call object. If this doesn't then the object which is locked is never +// consumed, and we should consider this a bug. +func findExtract(v ssa.Value, index int) (ssa.Value, bool) { + if refs := v.Referrers(); refs != nil { + for _, inst := range *refs { + if x, ok := inst.(*ssa.Extract); ok && x.Tuple == v && x.Index == index { + return inst.(ssa.Value), true + } + } + } + return nil, false +} + +// resolve resolves the given field list. +func (fl fieldList) resolve(v ssa.Value) (rv resolvedValue) { + return resolvedValue{ + value: v, + fieldList: fl, + valid: true, + } +} + +// valueAsString returns a string representing this value. +// +// This must align with how the string is generated in valueAsString. +func (rv resolvedValue) valueAsString(ls *lockState) string { + typ := rv.value.Type() + s := ls.valueAsString(rv.value) + for i, fieldNumber := range rv.fieldList { + switch { + case fieldNumber > 0: + field, ok := findField(typ, fieldNumber-1) + if !ok { + // This can't be resolved, return for debugging. + return fmt.Sprintf("{%s+%v}", s, rv.fieldList[i:]) + } + s = fmt.Sprintf("&(%s.%s)", s, field.Name()) + typ = field.Type() + case fieldNumber < 1: + field, ok := findField(typ, (-fieldNumber)-1) + if !ok { + // See above. + return fmt.Sprintf("{%s+%v}", s, rv.fieldList[i:]) + } + s = fmt.Sprintf("*(&(%s.%s))", s, field.Name()) + typ = field.Type() + } + } + return s +} + +// lockFieldFacts apply on every struct field. +type lockFieldFacts struct { + // IsMutex is true if the field is of type sync.Mutex. + IsMutex bool + + // IsRWMutex is true if the field is of type sync.RWMutex. + IsRWMutex bool + + // IsPointer indicates if the field is a pointer. + IsPointer bool + + // FieldNumber is the number of this field in the struct. + FieldNumber int +} + +// AFact implements analysis.Fact.AFact. +func (*lockFieldFacts) AFact() {} + +// lockGuardFacts contains guard information. +type lockGuardFacts struct { + // GuardedBy is the set of locks that are guarding this field. The key + // is the original annotation value, and the field list is the object + // traversal path. + GuardedBy map[string]fieldList + + // AtomicDisposition is the disposition for this field. Note that this + // can affect the interpretation of the GuardedBy field above, see the + // relevant comment. + AtomicDisposition atomicDisposition +} + +// AFact implements analysis.Fact.AFact. +func (*lockGuardFacts) AFact() {} + +// functionGuard is used by lockFunctionFacts, below. +type functionGuard struct { + // ParameterNumber is the index of the object that contains the + // guarding mutex. From this parameter, a walk is performed + // subsequently using the resolve method. + // + // Note that is ParameterNumber is beyond the size of parameters, then + // it may return to a return value. This applies only for the Acquires + // relation below. + ParameterNumber int + + // NeedsExtract is used in the case of a return value, and indicates + // that the field must be extracted from a tuple. + NeedsExtract bool + + // FieldList is the traversal path to the object. + FieldList fieldList +} + +// resolveReturn resolves a return value. +// +// Precondition: rv is either an ssa.Value, or an *ssa.Return. +func (fg *functionGuard) resolveReturn(rv interface{}, args int) resolvedValue { + if rv == nil { + // For defers and other objects, this may be nil. This is + // handled in state.go in the actual lock checking logic. + return resolvedValue{ + value: nil, + valid: false, + } + } + index := fg.ParameterNumber - args + // If this is a *ssa.Return object, i.e. we are analyzing the function + // and not the call site, then we can just pull the result directly. + if r, ok := rv.(*ssa.Return); ok { + return fg.FieldList.resolve(r.Results[index]) + } + if fg.NeedsExtract { + // Resolve on the extracted field, this is necessary if the + // type here is not an explicit return. Note that rv must be an + // ssa.Value, since it is not an *ssa.Return. + v, ok := findExtract(rv.(ssa.Value), index) + if !ok { + return resolvedValue{ + value: v, + valid: false, + } + } + return fg.FieldList.resolve(v) + } + if index != 0 { + // This should not happen, NeedsExtract should always be set. + panic("NeedsExtract is false, but return value index is non-zero") + } + // Resolve on the single return. + return fg.FieldList.resolve(rv.(ssa.Value)) +} + +// resolveStatic returns an ssa.Value representing the given field. +// +// Precondition: per resolveReturn. +func (fg *functionGuard) resolveStatic(fn *ssa.Function, rv interface{}) resolvedValue { + if fg.ParameterNumber >= len(fn.Params) { + return fg.resolveReturn(rv, len(fn.Params)) + } + return fg.FieldList.resolve(fn.Params[fg.ParameterNumber]) +} + +// resolveCall returns an ssa.Value representing the given field. +func (fg *functionGuard) resolveCall(args []ssa.Value, rv ssa.Value) resolvedValue { + if fg.ParameterNumber >= len(args) { + return fg.resolveReturn(rv, len(args)) + } + return fg.FieldList.resolve(args[fg.ParameterNumber]) +} + +// lockFunctionFacts apply on every method. +type lockFunctionFacts struct { + // HeldOnEntry tracks the names and number of parameter (including receiver) + // lockFuncfields that guard calls to this function. + // + // The key is the name specified in the checklocks annotation. e.g given + // the following code: + // + // ``` + // type A struct { + // mu sync.Mutex + // a int + // } + // + // // +checklocks:a.mu + // func xyz(a *A) {..} + // ``` + // + // '`+checklocks:a.mu' will result in an entry in this map as shown below. + // HeldOnEntry: {"a.mu" => {ParameterNumber: 0, FieldNumbers: {0}} + // + // Unlikely lockFieldFacts, there is no atomic interpretation. + HeldOnEntry map[string]functionGuard + + // HeldOnExit tracks the locks that are expected to be held on exit. + HeldOnExit map[string]functionGuard + + // Ignore means this function has local analysis ignores. + // + // This is not used outside the local package. + Ignore bool +} + +// AFact implements analysis.Fact.AFact. +func (*lockFunctionFacts) AFact() {} + +// checkGuard validates the guardName. +func (lff *lockFunctionFacts) checkGuard(pc *passContext, d *ast.FuncDecl, guardName string, allowReturn bool) (functionGuard, bool) { + if _, ok := lff.HeldOnEntry[guardName]; ok { + pc.maybeFail(d.Pos(), "annotation %s specified more than once, already required", guardName) + return functionGuard{}, false + } + if _, ok := lff.HeldOnExit[guardName]; ok { + pc.maybeFail(d.Pos(), "annotation %s specified more than once, already acquired", guardName) + return functionGuard{}, false + } + fg, ok := pc.findFunctionGuard(d, guardName, allowReturn) + return fg, ok +} + +// addGuardedBy adds a field to both HeldOnEntry and HeldOnExit. +func (lff *lockFunctionFacts) addGuardedBy(pc *passContext, d *ast.FuncDecl, guardName string) { + if fg, ok := lff.checkGuard(pc, d, guardName, false /* allowReturn */); ok { + if lff.HeldOnEntry == nil { + lff.HeldOnEntry = make(map[string]functionGuard) + } + if lff.HeldOnExit == nil { + lff.HeldOnExit = make(map[string]functionGuard) + } + lff.HeldOnEntry[guardName] = fg + lff.HeldOnExit[guardName] = fg + } +} + +// addAcquires adds a field to HeldOnExit. +func (lff *lockFunctionFacts) addAcquires(pc *passContext, d *ast.FuncDecl, guardName string) { + if fg, ok := lff.checkGuard(pc, d, guardName, true /* allowReturn */); ok { + if lff.HeldOnExit == nil { + lff.HeldOnExit = make(map[string]functionGuard) + } + lff.HeldOnExit[guardName] = fg + } +} + +// addReleases adds a field to HeldOnEntry. +func (lff *lockFunctionFacts) addReleases(pc *passContext, d *ast.FuncDecl, guardName string) { + if fg, ok := lff.checkGuard(pc, d, guardName, false /* allowReturn */); ok { + if lff.HeldOnEntry == nil { + lff.HeldOnEntry = make(map[string]functionGuard) + } + lff.HeldOnEntry[guardName] = fg + } +} + +// fieldListFor returns the fieldList for the given object. +func (pc *passContext) fieldListFor(pos token.Pos, fieldObj types.Object, index int, fieldName string, checkMutex bool) (int, bool) { + var lff lockFieldFacts + if !pc.pass.ImportObjectFact(fieldObj, &lff) { + // This should not happen: we export facts for all fields. + panic(fmt.Sprintf("no lockFieldFacts available for field %s", fieldName)) + } + // Check that it is indeed a mutex. + if checkMutex && !lff.IsMutex && !lff.IsRWMutex { + pc.maybeFail(pos, "field %s is not a mutex or an rwmutex", fieldName) + return 0, false + } + // Return the resolution path. + if lff.IsPointer { + return -(index + 1), true + } + return (index + 1), true +} + +// resolveOneField resolves a field in a single struct. +func (pc *passContext) resolveOneField(pos token.Pos, structType *types.Struct, fieldName string, checkMutex bool) (fl fieldList, fieldObj types.Object, ok bool) { + // Scan to match the next field. + for i := 0; i < structType.NumFields(); i++ { + fieldObj := structType.Field(i) + if fieldObj.Name() != fieldName { + continue + } + flOne, ok := pc.fieldListFor(pos, fieldObj, i, fieldName, checkMutex) + if !ok { + return nil, nil, false + } + fl = append(fl, flOne) + return fl, fieldObj, true + } + // Is this an embed? + for i := 0; i < structType.NumFields(); i++ { + fieldObj := structType.Field(i) + if !fieldObj.Embedded() { + continue + } + // Is this an embedded struct? + structType, ok := resolveStruct(fieldObj.Type()) + if !ok { + continue + } + // Need to check that there is a resolution path. If there is + // no resolution path that's not a failure: we just continue + // scanning the next embed to find a match. + flEmbed, okEmbed := pc.fieldListFor(pos, fieldObj, i, fieldName, false) + flCont, fieldObjCont, okCont := pc.resolveOneField(pos, structType, fieldName, checkMutex) + if okEmbed && okCont { + fl = append(fl, flEmbed) + fl = append(fl, flCont...) + return fl, fieldObjCont, true + } + } + pc.maybeFail(pos, "field %s does not exist", fieldName) + return nil, nil, false +} + +// resolveField resolves a set of fields given a string, such a 'a.b.c'. +// +// Note that this checks that the final element is a mutex of some kind, and +// will fail appropriately. +func (pc *passContext) resolveField(pos token.Pos, structType *types.Struct, parts []string) (fl fieldList, ok bool) { + for partNumber, fieldName := range parts { + flOne, fieldObj, ok := pc.resolveOneField(pos, structType, fieldName, partNumber >= len(parts)-1 /* checkMutex */) + if !ok { + // Error already reported. + return nil, false + } + fl = append(fl, flOne...) + if partNumber < len(parts)-1 { + // Traverse to the next type. + structType, ok = resolveStruct(fieldObj.Type()) + if !ok { + pc.maybeFail(pos, "invalid intermediate field %s", fieldName) + return fl, false + } + } + } + return fl, true +} + +var ( + mutexRE = regexp.MustCompile("((.*/)|^)sync.(CrossGoroutineMutex|Mutex)") + rwMutexRE = regexp.MustCompile("((.*/)|^)sync.(CrossGoroutineRWMutex|RWMutex)") +) + +// exportLockFieldFacts finds all struct fields that are mutexes, and ensures +// that they are annotated properly. +// +// This information is consumed subsequently by exportLockGuardFacts, and this +// function must be called first on all structures. +func (pc *passContext) exportLockFieldFacts(structType *types.Struct, ss *ast.StructType) { + for i, field := range ss.Fields.List { + lff := &lockFieldFacts{ + FieldNumber: i, + } + // We use HasSuffix below because fieldType can be fully + // qualified with the package name eg for the gvisor sync + // package mutex fields have the type: + // "<package path>/sync/sync.Mutex" + fieldObj := structType.Field(i) + s := fieldObj.Type().String() + switch { + case mutexRE.MatchString(s): + lff.IsMutex = true + case rwMutexRE.MatchString(s): + lff.IsRWMutex = true + } + // Save whether this is a pointer. + _, lff.IsPointer = fieldObj.Type().Underlying().(*types.Pointer) + // We must always export the lockFieldFacts, since traversal + // can take place along any object in the struct. + pc.pass.ExportObjectFact(fieldObj, lff) + // If this is an anonymous type, then we won't discover it via + // the AST global declarations. We can recurse from here. + if ss, ok := field.Type.(*ast.StructType); ok { + if st, ok := fieldObj.Type().(*types.Struct); ok { + pc.exportLockFieldFacts(st, ss) + } + } + } +} + +// exportLockGuardFacts finds all relevant guard information for structures. +// +// This function requires exportLockFieldFacts be called first on all +// structures. +func (pc *passContext) exportLockGuardFacts(structType *types.Struct, ss *ast.StructType) { + for i, field := range ss.Fields.List { + fieldObj := structType.Field(i) + if field.Doc != nil { + var ( + lff lockFieldFacts + lgf lockGuardFacts + ) + pc.pass.ImportObjectFact(structType.Field(i), &lff) + for _, l := range field.Doc.List { + pc.extractAnnotations(l.Text, map[string]func(string){ + checkAtomicAnnotation: func(string) { + switch lgf.AtomicDisposition { + case atomicRequired: + pc.maybeFail(fieldObj.Pos(), "annotation is redundant, already atomic required") + case atomicIgnore: + pc.maybeFail(fieldObj.Pos(), "annotation is contradictory, already atomic ignored") + } + lgf.AtomicDisposition = atomicRequired + }, + checkLocksIgnore: func(string) { + switch lgf.AtomicDisposition { + case atomicIgnore: + pc.maybeFail(fieldObj.Pos(), "annotation is redundant, already atomic ignored") + case atomicRequired: + pc.maybeFail(fieldObj.Pos(), "annotation is contradictory, already atomic required") + } + lgf.AtomicDisposition = atomicIgnore + }, + checkLocksAnnotation: func(guardName string) { + // Check for a duplicate annotation. + if _, ok := lgf.GuardedBy[guardName]; ok { + pc.maybeFail(fieldObj.Pos(), "annotation %s specified more than once", guardName) + return + } + fl, ok := pc.resolveField(fieldObj.Pos(), structType, strings.Split(guardName, ".")) + if ok { + // If we successfully resolved + // the field, then save it. + if lgf.GuardedBy == nil { + lgf.GuardedBy = make(map[string]fieldList) + } + lgf.GuardedBy[guardName] = fl + } + }, + }) + } + // Save only if there is something meaningful. + if len(lgf.GuardedBy) > 0 || lgf.AtomicDisposition != atomicDisallow { + pc.pass.ExportObjectFact(structType.Field(i), &lgf) + } + } + // See above, for anonymous structure fields. + if ss, ok := field.Type.(*ast.StructType); ok { + if st, ok := fieldObj.Type().(*types.Struct); ok { + pc.exportLockGuardFacts(st, ss) + } + } + } +} + +// countFields gives an accurate field count, according for unnamed arguments +// and return values and the compact identifier format. +func countFields(fl []*ast.Field) (count int) { + for _, field := range fl { + if len(field.Names) == 0 { + count++ + continue + } + count += len(field.Names) + } + return +} + +// matchFieldList attempts to match the given field. +func (pc *passContext) matchFieldList(pos token.Pos, fl []*ast.Field, guardName string) (functionGuard, bool) { + parts := strings.Split(guardName, ".") + parameterName := parts[0] + parameterNumber := 0 + for _, field := range fl { + // See countFields, above. + if len(field.Names) == 0 { + parameterNumber++ + continue + } + for _, name := range field.Names { + if name.Name != parameterName { + parameterNumber++ + continue + } + ptrType, ok := pc.pass.TypesInfo.TypeOf(field.Type).Underlying().(*types.Pointer) + if !ok { + // Since mutexes cannot be copied we only care + // about parameters that are pointer types when + // checking for guards. + pc.maybeFail(pos, "parameter name %s does not refer to a pointer type", parameterName) + return functionGuard{}, false + } + structType, ok := ptrType.Elem().Underlying().(*types.Struct) + if !ok { + // Fields can only be in named structures. + pc.maybeFail(pos, "parameter name %s does not refer to a pointer to a struct", parameterName) + return functionGuard{}, false + } + fg := functionGuard{ + ParameterNumber: parameterNumber, + } + fl, ok := pc.resolveField(pos, structType, parts[1:]) + fg.FieldList = fl + return fg, ok // If ok is false, already failed. + } + } + return functionGuard{}, false +} + +// findFunctionGuard identifies the parameter number and field number for a +// particular string of the 'a.b'. +// +// This function will report any errors directly. +func (pc *passContext) findFunctionGuard(d *ast.FuncDecl, guardName string, allowReturn bool) (functionGuard, bool) { + var ( + parameterList []*ast.Field + returnList []*ast.Field + ) + if d.Recv != nil { + parameterList = append(parameterList, d.Recv.List...) + } + if d.Type.Params != nil { + parameterList = append(parameterList, d.Type.Params.List...) + } + if fg, ok := pc.matchFieldList(d.Pos(), parameterList, guardName); ok { + return fg, ok + } + if allowReturn { + if d.Type.Results != nil { + returnList = append(returnList, d.Type.Results.List...) + } + if fg, ok := pc.matchFieldList(d.Pos(), returnList, guardName); ok { + // Fix this up to apply to the return value, as noted + // in fg.ParameterNumber. For the ssa analysis, we must + // record whether this has multiple results, since + // *ssa.Call indicates: "The Call instruction yields + // the function result if there is exactly one. + // Otherwise it returns a tuple, the components of + // which are accessed via Extract." + fg.ParameterNumber += countFields(parameterList) + fg.NeedsExtract = countFields(returnList) > 1 + return fg, ok + } + } + // We never saw a matching parameter. + pc.maybeFail(d.Pos(), "annotation %s does not have a matching parameter", guardName) + return functionGuard{}, false +} + +// exportFunctionFacts exports relevant function findings. +func (pc *passContext) exportFunctionFacts(d *ast.FuncDecl) { + if d.Doc == nil || d.Doc.List == nil { + return + } + var lff lockFunctionFacts + for _, l := range d.Doc.List { + pc.extractAnnotations(l.Text, map[string]func(string){ + checkLocksIgnore: func(string) { + // Note that this applies to all atomic + // analysis as well. There is no provided way + // to selectively ignore only lock analysis or + // atomic analysis, as we expect this use to be + // extremely rare. + lff.Ignore = true + }, + checkLocksAnnotation: func(guardName string) { lff.addGuardedBy(pc, d, guardName) }, + checkLocksAcquires: func(guardName string) { lff.addAcquires(pc, d, guardName) }, + checkLocksReleases: func(guardName string) { lff.addReleases(pc, d, guardName) }, + }) + } + + // Export the function facts if there is anything to save. + if lff.Ignore || len(lff.HeldOnEntry) > 0 || len(lff.HeldOnExit) > 0 { + funcObj := pc.pass.TypesInfo.Defs[d.Name].(*types.Func) + pc.pass.ExportObjectFact(funcObj, &lff) + } +} diff --git a/tools/checklocks/state.go b/tools/checklocks/state.go new file mode 100644 index 000000000..57061a32e --- /dev/null +++ b/tools/checklocks/state.go @@ -0,0 +1,315 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checklocks + +import ( + "fmt" + "go/token" + "go/types" + "strings" + "sync/atomic" + + "golang.org/x/tools/go/ssa" +) + +// lockState tracks the locking state and aliases. +type lockState struct { + // lockedMutexes is used to track which mutexes in a given struct are + // currently locked. Note that most of the heavy lifting is done by + // valueAsString below, which maps to specific structure fields, etc. + lockedMutexes []string + + // stored stores values that have been stored in memory, bound to + // FreeVars or passed as Parameterse. + stored map[ssa.Value]ssa.Value + + // used is a temporary map, used only for valueAsString. It prevents + // multiple use of the same memory location. + used map[ssa.Value]struct{} + + // defers are the stack of defers that have been pushed. + defers []*ssa.Defer + + // refs indicates the number of references on this structure. If it's + // greater than one, we will do copy-on-write. + refs *int32 +} + +// newLockState makes a new lockState. +func newLockState() *lockState { + refs := int32(1) // Not shared. + return &lockState{ + lockedMutexes: make([]string, 0), + used: make(map[ssa.Value]struct{}), + stored: make(map[ssa.Value]ssa.Value), + defers: make([]*ssa.Defer, 0), + refs: &refs, + } +} + +// fork forks the locking state. When a lockState is forked, any modifications +// will cause maps to be copied. +func (l *lockState) fork() *lockState { + if l == nil { + return newLockState() + } + atomic.AddInt32(l.refs, 1) + return &lockState{ + lockedMutexes: l.lockedMutexes, + used: make(map[ssa.Value]struct{}), + stored: l.stored, + defers: l.defers, + refs: l.refs, + } +} + +// modify indicates that this state will be modified. +func (l *lockState) modify() { + if atomic.LoadInt32(l.refs) > 1 { + // Copy the lockedMutexes. + lm := make([]string, len(l.lockedMutexes)) + copy(lm, l.lockedMutexes) + l.lockedMutexes = lm + + // Copy the stored values. + s := make(map[ssa.Value]ssa.Value) + for k, v := range l.stored { + s[k] = v + } + l.stored = s + + // Reset the used values. + l.used = make(map[ssa.Value]struct{}) + + // Copy the defers. + ds := make([]*ssa.Defer, len(l.defers)) + copy(ds, l.defers) + l.defers = ds + + // Drop our reference. + atomic.AddInt32(l.refs, -1) + newRefs := int32(1) // Not shared. + l.refs = &newRefs + } +} + +// isHeld indicates whether the field is held is not. +func (l *lockState) isHeld(rv resolvedValue) (string, bool) { + if !rv.valid { + return rv.valueAsString(l), false + } + s := rv.valueAsString(l) + for _, k := range l.lockedMutexes { + if k == s { + return s, true + } + } + return s, false +} + +// lockField locks the given field. +// +// If false is returned, the field was already locked. +func (l *lockState) lockField(rv resolvedValue) (string, bool) { + if !rv.valid { + return rv.valueAsString(l), false + } + s := rv.valueAsString(l) + for _, k := range l.lockedMutexes { + if k == s { + return s, false + } + } + l.modify() + l.lockedMutexes = append(l.lockedMutexes, s) + return s, true +} + +// unlockField unlocks the given field. +// +// If false is returned, the field was not locked. +func (l *lockState) unlockField(rv resolvedValue) (string, bool) { + if !rv.valid { + return rv.valueAsString(l), false + } + s := rv.valueAsString(l) + for i, k := range l.lockedMutexes { + if k == s { + // Copy the last lock in and truncate. + l.modify() + l.lockedMutexes[i] = l.lockedMutexes[len(l.lockedMutexes)-1] + l.lockedMutexes = l.lockedMutexes[:len(l.lockedMutexes)-1] + return s, true + } + } + return s, false +} + +// store records an alias. +func (l *lockState) store(addr ssa.Value, v ssa.Value) { + l.modify() + l.stored[addr] = v +} + +// isSubset indicates other holds all the locks held by l. +func (l *lockState) isSubset(other *lockState) bool { + held := 0 // Number in l, held by other. + for _, k := range l.lockedMutexes { + for _, ok := range other.lockedMutexes { + if k == ok { + held++ + break + } + } + } + return held >= len(l.lockedMutexes) +} + +// count indicates the number of locks held. +func (l *lockState) count() int { + return len(l.lockedMutexes) +} + +// isCompatible returns true if the states are compatible. +func (l *lockState) isCompatible(other *lockState) bool { + return l.isSubset(other) && other.isSubset(l) +} + +// elemType is a type that implements the Elem function. +type elemType interface { + Elem() types.Type +} + +// valueAsString returns a string for a given value. +// +// This decomposes the value into the simplest possible representation in terms +// of parameters, free variables and globals. During resolution, stored values +// may be transferred, as well as bound free variables. +// +// Nil may not be passed here. +func (l *lockState) valueAsString(v ssa.Value) string { + switch x := v.(type) { + case *ssa.Parameter: + // Was this provided as a paramter for a local anonymous + // function invocation? + v, ok := l.stored[x] + if ok { + return l.valueAsString(v) + } + return fmt.Sprintf("{param:%s}", x.Name()) + case *ssa.Global: + return fmt.Sprintf("{global:%s}", x.Name()) + case *ssa.FreeVar: + // Attempt to resolve this, in case we are being invoked in a + // scope where all the variables are bound. + v, ok := l.stored[x] + if ok { + // The FreeVar is typically bound to a location, so we + // check what's been stored there. Note that the second + // may map to the same FreeVar, which we can check. + stored, ok := l.stored[v] + if ok { + return l.valueAsString(stored) + } + } + return fmt.Sprintf("{freevar:%s}", x.Name()) + case *ssa.Convert: + // Just disregard conversion. + return l.valueAsString(x.X) + case *ssa.ChangeType: + // Ditto, disregard. + return l.valueAsString(x.X) + case *ssa.UnOp: + if x.Op != token.MUL { + break + } + // Is this loading a free variable? If yes, then this can be + // resolved in the original isAlias function. + if fv, ok := x.X.(*ssa.FreeVar); ok { + return l.valueAsString(fv) + } + // Should be try to resolve via a memory address? This needs to + // be done since a memory location can hold its own value. + if _, ok := l.used[x.X]; !ok { + // Check if we know what the accessed location holds. + // This is used to disambiguate memory locations. + v, ok := l.stored[x.X] + if ok { + l.used[x.X] = struct{}{} + defer func() { delete(l.used, x.X) }() + return l.valueAsString(v) + } + } + // x.X.Type is pointer. We must construct this type + // dynamically, since the ssa.Value could be synthetic. + return fmt.Sprintf("*(%s)", l.valueAsString(x.X)) + case *ssa.Field: + structType, ok := resolveStruct(x.X.Type()) + if !ok { + // This should not happen. + panic(fmt.Sprintf("structType not available for struct: %#v", x.X)) + } + fieldObj := structType.Field(x.Field) + return fmt.Sprintf("%s.%s", l.valueAsString(x.X), fieldObj.Name()) + case *ssa.FieldAddr: + structType, ok := resolveStruct(x.X.Type()) + if !ok { + // This should not happen. + panic(fmt.Sprintf("structType not available for struct: %#v", x.X)) + } + fieldObj := structType.Field(x.Field) + return fmt.Sprintf("&(%s.%s)", l.valueAsString(x.X), fieldObj.Name()) + case *ssa.Index: + return fmt.Sprintf("%s[%s]", l.valueAsString(x.X), l.valueAsString(x.Index)) + case *ssa.IndexAddr: + return fmt.Sprintf("&(%s[%s])", l.valueAsString(x.X), l.valueAsString(x.Index)) + case *ssa.Lookup: + return fmt.Sprintf("%s[%s]", l.valueAsString(x.X), l.valueAsString(x.Index)) + case *ssa.Extract: + return fmt.Sprintf("%s[%d]", l.valueAsString(x.Tuple), x.Index) + } + + // In the case of any other type (e.g. this may be an alloc, a return + // value, etc.), just return the literal pointer value to the Value. + // This will be unique within the ssa graph, and so if two values are + // equal, they are from the same type. + return fmt.Sprintf("{%T:%p}", v, v) +} + +// String returns the full lock state. +func (l *lockState) String() string { + if l.count() == 0 { + return "no locks held" + } + return strings.Join(l.lockedMutexes, ",") +} + +// pushDefer pushes a defer onto the stack. +func (l *lockState) pushDefer(d *ssa.Defer) { + l.modify() + l.defers = append(l.defers, d) +} + +// popDefer pops a defer from the stack. +func (l *lockState) popDefer() *ssa.Defer { + // Does not technically modify the underlying slice. + count := len(l.defers) + if count == 0 { + return nil + } + d := l.defers[count-1] + l.defers = l.defers[:count-1] + return d +} diff --git a/tools/checklocks/test/BUILD b/tools/checklocks/test/BUILD index b055e71d9..d4d98c256 100644 --- a/tools/checklocks/test/BUILD +++ b/tools/checklocks/test/BUILD @@ -4,5 +4,18 @@ package(licenses = ["notice"]) go_library( name = "test", - srcs = ["test.go"], + srcs = [ + "alignment.go", + "anon.go", + "atomics.go", + "basics.go", + "branches.go", + "closures.go", + "defer.go", + "incompat.go", + "methods.go", + "parameters.go", + "return.go", + "test.go", + ], ) diff --git a/tools/checklocks/test/alignment.go b/tools/checklocks/test/alignment.go new file mode 100644 index 000000000..cd857ff73 --- /dev/null +++ b/tools/checklocks/test/alignment.go @@ -0,0 +1,51 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +type alignedStruct32 struct { + v int32 +} + +type alignedStruct64 struct { + v int64 +} + +type alignedStructGood struct { + v0 alignedStruct32 + v1 alignedStruct32 + v2 alignedStruct64 +} + +type alignedStructGoodArray0 struct { + v0 [3]alignedStruct32 + v1 [3]alignedStruct32 + v2 alignedStruct64 +} + +type alignedStructGoodArray1 [16]alignedStructGood + +type alignedStructBad struct { + v0 alignedStruct32 + v1 alignedStruct64 + v2 alignedStruct32 +} + +type alignedStructBadArray0 struct { + v0 [3]alignedStruct32 + v1 [2]alignedStruct64 + v2 [1]alignedStruct32 +} + +type alignedStructBadArray1 [16]alignedStructBad diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/tools/checklocks/test/anon.go index a4bc08411..a1f6bddda 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go +++ b/tools/checklocks/test/anon.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,19 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. -package disklayout +package test -import ( - "reflect" - "testing" +import "sync" - "gvisor.dev/gvisor/pkg/marshal" -) +type anonStruct struct { + anon struct { + mu sync.RWMutex + // +checklocks:mu + x int + } +} -func assertSize(t *testing.T, v marshal.Marshallable, want int) { - t.Helper() +func testAnonAccessValid(tc *anonStruct) { + tc.anon.mu.Lock() + tc.anon.x = 1 + tc.anon.mu.Unlock() +} - if got := v.SizeBytes(); got != want { - t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) - } +func testAnonAccessInvalid(tc *anonStruct) { + tc.anon.x = 1 // +checklocksfail } diff --git a/tools/checklocks/test/atomics.go b/tools/checklocks/test/atomics.go new file mode 100644 index 000000000..8e060d8a2 --- /dev/null +++ b/tools/checklocks/test/atomics.go @@ -0,0 +1,91 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +import ( + "sync" + "sync/atomic" +) + +type atomicStruct struct { + accessedNormally int32 + + // +checkatomic + accessedAtomically int32 + + // +checklocksignore + ignored int32 +} + +func testNormalAccess(tc *atomicStruct, v chan int32, p chan *int32) { + v <- tc.accessedNormally + p <- &tc.accessedNormally +} + +func testAtomicAccess(tc *atomicStruct, v chan int32) { + v <- atomic.LoadInt32(&tc.accessedAtomically) +} + +func testAtomicAccessInvalid(tc *atomicStruct, v chan int32) { + v <- atomic.LoadInt32(&tc.accessedNormally) // +checklocksfail +} + +func testNormalAccessInvalid(tc *atomicStruct, v chan int32, p chan *int32) { + v <- tc.accessedAtomically // +checklocksfail + p <- &tc.accessedAtomically // +checklocksfail +} + +func testIgnored(tc *atomicStruct, v chan int32, p chan *int32) { + v <- atomic.LoadInt32(&tc.ignored) + v <- tc.ignored + p <- &tc.ignored +} + +type atomicMixedStruct struct { + mu sync.Mutex + + // +checkatomic + // +checklocks:mu + accessedMixed int32 +} + +func testAtomicMixedValidRead(tc *atomicMixedStruct, v chan int32) { + v <- atomic.LoadInt32(&tc.accessedMixed) +} + +func testAtomicMixedInvalidRead(tc *atomicMixedStruct, v chan int32, p chan *int32) { + v <- tc.accessedMixed // +checklocksfail + p <- &tc.accessedMixed // +checklocksfail +} + +func testAtomicMixedValidLockedWrite(tc *atomicMixedStruct, v chan int32, p chan *int32) { + tc.mu.Lock() + atomic.StoreInt32(&tc.accessedMixed, 1) + tc.mu.Unlock() +} + +func testAtomicMixedInvalidLockedWrite(tc *atomicMixedStruct, v chan int32, p chan *int32) { + tc.mu.Lock() + tc.accessedMixed = 1 // +checklocksfail:2 + tc.mu.Unlock() +} + +func testAtomicMixedInvalidAtomicWrite(tc *atomicMixedStruct, v chan int32, p chan *int32) { + atomic.StoreInt32(&tc.accessedMixed, 1) // +checklocksfail +} + +func testAtomicMixedInvalidWrite(tc *atomicMixedStruct, v chan int32, p chan *int32) { + tc.accessedMixed = 1 // +checklocksfail:2 +} diff --git a/tools/checklocks/test/basics.go b/tools/checklocks/test/basics.go new file mode 100644 index 000000000..7a773171f --- /dev/null +++ b/tools/checklocks/test/basics.go @@ -0,0 +1,145 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +import ( + "sync" +) + +func testLockedAccessValid(tc *oneGuardStruct) { + tc.mu.Lock() + tc.guardedField = 1 + tc.mu.Unlock() +} + +func testLockedAccessIgnore(tc *oneGuardStruct) { + tc.mu.Lock() + tc.unguardedField = 1 + tc.mu.Unlock() +} + +func testUnlockedAccessInvalidWrite(tc *oneGuardStruct) { + tc.guardedField = 2 // +checklocksfail +} + +func testUnlockedAccessInvalidRead(tc *oneGuardStruct) { + x := tc.guardedField // +checklocksfail + _ = x +} + +func testUnlockedAccessValid(tc *oneGuardStruct) { + tc.unguardedField = 2 +} + +func testCallValidAccess(tc *oneGuardStruct) { + callValidAccess(tc) +} + +func callValidAccess(tc *oneGuardStruct) { + tc.mu.Lock() + tc.guardedField = 1 + tc.mu.Unlock() +} + +func testCallValueMixup(tc *oneGuardStruct) { + callValueMixup(tc, tc) +} + +func callValueMixup(tc1, tc2 *oneGuardStruct) { + tc1.mu.Lock() + tc2.guardedField = 2 // +checklocksfail + tc1.mu.Unlock() +} + +func testCallPreconditionsInvalid(tc *oneGuardStruct) { + callPreconditions(tc) // +checklocksfail +} + +func testCallPreconditionsValid(tc *oneGuardStruct) { + tc.mu.Lock() + callPreconditions(tc) + tc.mu.Unlock() +} + +// +checklocks:tc.mu +func callPreconditions(tc *oneGuardStruct) { + tc.guardedField = 1 +} + +type nestedFieldsStruct struct { + mu sync.Mutex + + // +checklocks:mu + nestedStruct struct { + nested1 int + nested2 int + } +} + +func testNestedGuardValid(tc *nestedFieldsStruct) { + tc.mu.Lock() + tc.nestedStruct.nested1 = 1 + tc.nestedStruct.nested2 = 2 + tc.mu.Unlock() +} + +func testNestedGuardInvalid(tc *nestedFieldsStruct) { + tc.nestedStruct.nested1 = 1 // +checklocksfail +} + +type rwGuardStruct struct { + rwMu sync.RWMutex + + // +checklocks:rwMu + guardedField int +} + +func testRWValidRead(tc *rwGuardStruct) { + tc.rwMu.Lock() + tc.guardedField = 1 + tc.rwMu.Unlock() +} + +func testRWValidWrite(tc *rwGuardStruct) { + tc.rwMu.RLock() + tc.guardedField = 2 + tc.rwMu.RUnlock() +} + +func testRWInvalidWrite(tc *rwGuardStruct) { + tc.guardedField = 3 // +checklocksfail +} + +func testRWInvalidRead(tc *rwGuardStruct) { + x := tc.guardedField + 3 // +checklocksfail + _ = x +} + +func testTwoLocksDoubleGuardStructValid(tc *twoLocksDoubleGuardStruct) { + tc.mu.Lock() + tc.secondMu.Lock() + tc.doubleGuardedField = 1 + tc.secondMu.Unlock() +} + +func testTwoLocksDoubleGuardStructOnlyOne(tc *twoLocksDoubleGuardStruct) { + tc.mu.Lock() + tc.doubleGuardedField = 2 // +checklocksfail + tc.mu.Unlock() +} + +func testTwoLocksDoubleGuardStructInvalid(tc *twoLocksDoubleGuardStruct) { + tc.doubleGuardedField = 3 // +checklocksfail:2 +} diff --git a/tools/checklocks/test/branches.go b/tools/checklocks/test/branches.go new file mode 100644 index 000000000..81fec29e5 --- /dev/null +++ b/tools/checklocks/test/branches.go @@ -0,0 +1,56 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +import ( + "math/rand" +) + +func testInconsistentReturn(tc *oneGuardStruct) { // +checklocksfail + if x := rand.Intn(10); x%2 == 1 { + tc.mu.Lock() + } +} + +func testConsistentBranching(tc *oneGuardStruct) { + x := rand.Intn(10) + if x%2 == 1 { + tc.mu.Lock() + } else { + tc.mu.Lock() + } + tc.guardedField = 1 + if x%2 == 1 { + tc.mu.Unlock() + } else { + tc.mu.Unlock() + } +} + +func testInconsistentBranching(tc *oneGuardStruct) { // +checklocksfail:2 + // We traverse the control flow graph in all consistent ways. We cannot + // determine however, that the first if block and second if block will + // evaluate to the same condition. Therefore, there are two consistent + // paths through this code, and two inconsistent paths. Either way, the + // guardedField should be also marked as an invalid access. + x := rand.Intn(10) + if x%2 == 1 { + tc.mu.Lock() + } + tc.guardedField = 1 // +checklocksfail + if x%2 == 1 { + tc.mu.Unlock() // +checklocksforce + } +} diff --git a/tools/checklocks/test/closures.go b/tools/checklocks/test/closures.go new file mode 100644 index 000000000..7da87540a --- /dev/null +++ b/tools/checklocks/test/closures.go @@ -0,0 +1,100 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +func testClosureInvalid(tc *oneGuardStruct) { + // This is expected to fail. + callClosure(func() { + tc.guardedField = 1 // +checklocksfail + }) +} + +func testClosureUnsupported(tc *oneGuardStruct) { + // Locked outside the closure, so may or may not be valid. This cannot + // be handled and we should explicitly fail. This can't be handled + // because of the call through callClosure, below, which means the + // closure will actually be passed as a value somewhere. + tc.mu.Lock() + callClosure(func() { + tc.guardedField = 1 // +checklocksfail + }) + tc.mu.Unlock() +} + +func testClosureValid(tc *oneGuardStruct) { + // All locking happens within the closure. This should not present a + // problem for analysis. + callClosure(func() { + tc.mu.Lock() + tc.guardedField = 1 + tc.mu.Unlock() + }) +} + +func testClosureInline(tc *oneGuardStruct) { + // If the closure is being dispatching inline only, then we should be + // able to analyze this call and give it a thumbs up. + tc.mu.Lock() + func() { + tc.guardedField = 1 + }() + tc.mu.Unlock() +} + +func testAnonymousInvalid(tc *oneGuardStruct) { + // Invalid, as per testClosureInvalid above. + callAnonymous(func(tc *oneGuardStruct) { + tc.guardedField = 1 // +checklocksfail + }, tc) +} + +func testAnonymousUnsupported(tc *oneGuardStruct) { + // Not supportable, as per testClosureUnsupported above. + tc.mu.Lock() + callAnonymous(func(tc *oneGuardStruct) { + tc.guardedField = 1 // +checklocksfail + }, tc) + tc.mu.Unlock() +} + +func testAnonymousValid(tc *oneGuardStruct) { + // Valid, as per testClosureValid above. + callAnonymous(func(tc *oneGuardStruct) { + tc.mu.Lock() + tc.guardedField = 1 + tc.mu.Unlock() + }, tc) +} + +func testAnonymousInline(tc *oneGuardStruct) { + // Unlike the closure case, we are able to dynamically infer the set of + // preconditions for the function dispatch and assert that this is + // a valid call. + tc.mu.Lock() + func(tc *oneGuardStruct) { + tc.guardedField = 1 + }(tc) + tc.mu.Unlock() +} + +//go:noinline +func callClosure(fn func()) { + fn() +} + +//go:noinline +func callAnonymous(fn func(*oneGuardStruct), tc *oneGuardStruct) { + fn(tc) +} diff --git a/pkg/flipcall/packet_window_mmap_arm64.go b/tools/checklocks/test/defer.go index 87ad1a4a1..6e574e5eb 100644 --- a/pkg/flipcall/packet_window_mmap_arm64.go +++ b/tools/checklocks/test/defer.go @@ -12,14 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build arm64 +package test -package flipcall +func testDeferValidUnlock(tc *oneGuardStruct) { + tc.mu.Lock() + tc.guardedField = 1 + defer tc.mu.Unlock() +} -import "golang.org/x/sys/unix" +func testDeferValidAccess(tc *oneGuardStruct) { + tc.mu.Lock() + defer func() { + tc.guardedField = 1 + tc.mu.Unlock() + }() +} -// Return a memory mapping of the pwd in memory that can be shared outside the sandbox. -func packetWindowMmap(pwd PacketWindowDescriptor) (uintptr, unix.Errno) { - m, _, err := unix.RawSyscall6(unix.SYS_MMAP, 0, uintptr(pwd.Length), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset)) - return m, err +func testDeferInvalidAccess(tc *oneGuardStruct) { + tc.mu.Lock() + defer func() { + // N.B. Executed after tc.mu.Unlock(). + tc.guardedField = 1 // +checklocksfail + }() + tc.mu.Unlock() } diff --git a/tools/checklocks/test/incompat.go b/tools/checklocks/test/incompat.go new file mode 100644 index 000000000..b39bc66c1 --- /dev/null +++ b/tools/checklocks/test/incompat.go @@ -0,0 +1,54 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +import ( + "sync" +) + +// unsupportedLockerStruct verifies that trying to annotate a field that is not a +// sync.Mutex or sync.RWMutex results in a failure. +type unsupportedLockerStruct struct { + mu sync.Locker + + // +checklocks:mu + x int // +checklocksfail +} + +// badFieldsStruct verifies that refering invalid fields fails. +type badFieldsStruct struct { + // +checklocks:mu + x int // +checklocksfail +} + +// redundantStruct verifies that redundant annotations fail. +type redundantStruct struct { + mu sync.Mutex + + // +checklocks:mu + // +checklocks:mu + x int // +checklocksfail +} + +// conflictsStruct verifies that conflicting annotations fail. +type conflictsStruct struct { + // +checkatomicignore + // +checkatomic + x int // +checklocksfail + + // +checkatomic + // +checkatomicignore + y int // +checklocksfail +} diff --git a/tools/checklocks/test/methods.go b/tools/checklocks/test/methods.go new file mode 100644 index 000000000..72e26fca6 --- /dev/null +++ b/tools/checklocks/test/methods.go @@ -0,0 +1,117 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +import ( + "sync" +) + +type testMethods struct { + mu sync.Mutex + + // +checklocks:mu + guardedField int +} + +func (t *testMethods) methodValid() { + t.mu.Lock() + t.guardedField = 1 + t.mu.Unlock() +} + +func (t *testMethods) methodInvalid() { + t.guardedField = 2 // +checklocksfail +} + +// +checklocks:t.mu +func (t *testMethods) MethodLocked(a, b, c int) { + t.guardedField = 3 +} + +// +checklocksignore +func (t *testMethods) methodIgnore() { + t.guardedField = 2 +} + +func testMethodCallsValid(tc *testMethods) { + tc.methodValid() +} + +func testMethodCallsValidPreconditions(tc *testMethods) { + tc.mu.Lock() + tc.MethodLocked(1, 2, 3) + tc.mu.Unlock() +} + +func testMethodCallsInvalid(tc *testMethods) { + tc.MethodLocked(4, 5, 6) // +checklocksfail +} + +func testMultipleParameters(tc1, tc2, tc3 *testMethods) { + tc1.mu.Lock() + tc1.guardedField = 1 + tc2.guardedField = 2 // +checklocksfail + tc3.guardedField = 3 // +checklocksfail + tc1.mu.Unlock() +} + +type testMethodsWithParameters struct { + mu sync.Mutex + + // +checklocks:mu + guardedField int +} + +type ptrToTestMethodsWithParameters *testMethodsWithParameters + +// +checklocks:t.mu +// +checklocks:a.mu +func (t *testMethodsWithParameters) methodLockedWithParameters(a *testMethodsWithParameters, b *testMethodsWithParameters) { + t.guardedField = a.guardedField + b.guardedField = a.guardedField // +checklocksfail +} + +// +checklocks:t.mu +// +checklocks:a.mu +// +checklocks:b.mu +func (t *testMethodsWithParameters) methodLockedWithPtrType(a *testMethodsWithParameters, b ptrToTestMethodsWithParameters) { + t.guardedField = a.guardedField + b.guardedField = a.guardedField +} + +// +checklocks:a.mu +func standaloneFunctionWithGuard(a *testMethodsWithParameters) { + a.guardedField = 1 + a.mu.Unlock() + a.guardedField = 1 // +checklocksfail +} + +type testMethodsWithEmbedded struct { + mu sync.Mutex + + // +checklocks:mu + guardedField int + p *testMethodsWithParameters +} + +// +checklocks:t.mu +func (t *testMethodsWithEmbedded) DoLocked(a, b *testMethodsWithParameters) { + t.guardedField = 1 + a.mu.Lock() + b.mu.Lock() + t.p.methodLockedWithParameters(a, b) // +checklocksfail + a.mu.Unlock() + b.mu.Unlock() +} diff --git a/tools/checklocks/test/parameters.go b/tools/checklocks/test/parameters.go new file mode 100644 index 000000000..5b9e664b6 --- /dev/null +++ b/tools/checklocks/test/parameters.go @@ -0,0 +1,48 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +func testParameterPassingbyAddrValid(tc *oneGuardStruct) { + tc.mu.Lock() + nestedWithGuardByAddr(&tc.guardedField, &tc.unguardedField) + tc.mu.Unlock() +} + +func testParameterPassingByAddrInalid(tc *oneGuardStruct) { + nestedWithGuardByAddr(&tc.guardedField, &tc.unguardedField) // +checklocksfail +} + +func testParameterPassingByValueValid(tc *oneGuardStruct) { + tc.mu.Lock() + nestedWithGuardByValue(tc.guardedField, tc.unguardedField) + tc.mu.Unlock() +} + +func testParameterPassingByValueInalid(tc *oneGuardStruct) { + nestedWithGuardByValue(tc.guardedField, tc.unguardedField) // +checklocksfail +} + +func nestedWithGuardByAddr(guardedField, unguardedField *int) { + *guardedField = 4 + *unguardedField = 5 +} + +func nestedWithGuardByValue(guardedField, unguardedField int) { + // read the fields to keep SA4009 static analyzer happy. + _ = guardedField + _ = unguardedField + guardedField = 4 + unguardedField = 5 +} diff --git a/tools/checklocks/test/return.go b/tools/checklocks/test/return.go new file mode 100644 index 000000000..47c7b6773 --- /dev/null +++ b/tools/checklocks/test/return.go @@ -0,0 +1,61 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +// +checklocks:tc.mu +func testReturnInvalidGuard() (tc *oneGuardStruct) { // +checklocksfail + return new(oneGuardStruct) +} + +// +checklocksrelease:tc.mu +func testReturnInvalidRelease() (tc *oneGuardStruct) { // +checklocksfail + return new(oneGuardStruct) +} + +// +checklocksacquire:tc.mu +func testReturnInvalidAcquire() (tc *oneGuardStruct) { + return new(oneGuardStruct) // +checklocksfail +} + +// +checklocksacquire:tc.mu +func testReturnValidAcquire() (tc *oneGuardStruct) { + tc = new(oneGuardStruct) + tc.mu.Lock() + return tc +} + +func testReturnAcquireCall() { + tc := testReturnValidAcquire() + tc.guardedField = 1 + tc.mu.Unlock() +} + +// +checklocksacquire:tc.val.mu +// +checklocksacquire:tc.ptr.mu +func testReturnValidNestedAcquire() (tc *nestedGuardStruct) { + tc = new(nestedGuardStruct) + tc.ptr = new(oneGuardStruct) + tc.val.mu.Lock() + tc.ptr.mu.Lock() + return tc +} + +func testReturnNestedAcquireCall() { + tc := testReturnValidNestedAcquire() + tc.val.guardedField = 1 + tc.ptr.guardedField = 1 + tc.val.mu.Unlock() + tc.ptr.mu.Unlock() +} diff --git a/tools/checklocks/test/test.go b/tools/checklocks/test/test.go index 05693c183..cbf6b1635 100644 --- a/tools/checklocks/test/test.go +++ b/tools/checklocks/test/test.go @@ -13,99 +13,24 @@ // limitations under the License. // Package test is a test package. +// +// Tests are all compilation tests in separate files. package test import ( - "math/rand" "sync" ) -type oneGuarded struct { +// oneGuardStruct has one guarded field. +type oneGuardStruct struct { mu sync.Mutex // +checklocks:mu - guardedField int - + guardedField int unguardedField int } -func testAccessOne() { - var tc oneGuarded - // Valid access - tc.mu.Lock() - tc.guardedField = 1 - tc.unguardedField = 1 - tc.mu.Unlock() - - // Valid access as unguarded field is not protected by mu. - tc.unguardedField = 2 - - // Invalid access - tc.guardedField = 2 // +checklocksfail - - // Invalid read of a guarded field. - x := tc.guardedField // +checklocksfail - _ = x -} - -func testFunctionCallsNoParameters() { - // Couple of regular function calls with no parameters. - funcCallWithValidAccess() - funcCallWithInvalidAccess() -} - -func funcCallWithValidAccess() { - var tc2 oneGuarded - // Valid tc2 access - tc2.mu.Lock() - tc2.guardedField = 1 - tc2.mu.Unlock() -} - -func funcCallWithInvalidAccess() { - var tc oneGuarded - var tc2 oneGuarded - // Invalid access, wrong mutex is held. - tc.mu.Lock() - tc2.guardedField = 2 // +checklocksfail - tc.mu.Unlock() -} - -func testParameterPassing() { - var tc oneGuarded - - // Valid call where a guardedField is passed to a function as a parameter. - tc.mu.Lock() - nestedWithGuardByAddr(&tc.guardedField, &tc.unguardedField) - tc.mu.Unlock() - - // Invalid call where a guardedField is passed to a function as a parameter - // without holding locks. - nestedWithGuardByAddr(&tc.guardedField, &tc.unguardedField) // +checklocksfail - - // Valid call where a guardedField is passed to a function as a parameter. - tc.mu.Lock() - nestedWithGuardByValue(tc.guardedField, tc.unguardedField) - tc.mu.Unlock() - - // Invalid call where a guardedField is passed to a function as a parameter - // without holding locks. - nestedWithGuardByValue(tc.guardedField, tc.unguardedField) // +checklocksfail -} - -func nestedWithGuardByAddr(guardedField, unguardedField *int) { - *guardedField = 4 - *unguardedField = 5 -} - -func nestedWithGuardByValue(guardedField, unguardedField int) { - // read the fields to keep SA4009 static analyzer happy. - _ = guardedField - _ = unguardedField - guardedField = 4 - unguardedField = 5 -} - -type twoGuarded struct { +// twoGuardStruct has two guarded fields. +type twoGuardStruct struct { mu sync.Mutex // +checklocks:mu guardedField1 int @@ -113,250 +38,27 @@ type twoGuarded struct { guardedField2 int } -type twoLocks struct { +// twoLocksStruct has two locks and two fields. +type twoLocksStruct struct { mu sync.Mutex secondMu sync.Mutex - // +checklocks:mu guardedField1 int // +checklocks:secondMu guardedField2 int } -type twoLocksDoubleGuard struct { +// twoLocksDoubleGuardStruct has two locks and a single field with two guards. +type twoLocksDoubleGuardStruct struct { mu sync.Mutex secondMu sync.Mutex - // +checklocks:mu // +checklocks:secondMu doubleGuardedField int } -func testTwoLocksDoubleGuard() { - var tc twoLocksDoubleGuard - - // Double guarded field - tc.mu.Lock() - tc.secondMu.Lock() - tc.doubleGuardedField = 1 - tc.secondMu.Unlock() - - // This should fail as we released the secondMu. - tc.doubleGuardedField = 2 // +checklocksfail - tc.mu.Unlock() - - // This should fail as well as now we are not holding any locks. - // - // This line triggers two failures one for each mutex, hence the 2 after - // fail. - tc.doubleGuardedField = 3 // +checklocksfail:2 -} - -type rwGuarded struct { - rwMu sync.RWMutex - - // +checklocks:rwMu - rwGuardedField int -} - -func testRWGuarded() { - var tc rwGuarded - - // Assignment w/ exclusive lock should pass. - tc.rwMu.Lock() - tc.rwGuardedField = 1 - tc.rwMu.Unlock() - - // Assignment w/ RWLock should pass as we don't differentiate between - // Lock/RLock. - tc.rwMu.RLock() - tc.rwGuardedField = 2 - tc.rwMu.RUnlock() - - // Assignment w/o hold Lock() should fail. - tc.rwGuardedField = 3 // +checklocksfail - - // Reading w/o holding lock should fail. - x := tc.rwGuardedField + 3 // +checklocksfail - _ = x -} - -type nestedFields struct { - mu sync.Mutex - - // +checklocks:mu - nestedStruct struct { - nested1 int - nested2 int - } -} - -func testNestedStructGuards() { - var tc nestedFields - // Valid access with mu held. - tc.mu.Lock() - tc.nestedStruct.nested1 = 1 - tc.nestedStruct.nested2 = 2 - tc.mu.Unlock() - - // Invalid access to nested1 wihout holding mu. - tc.nestedStruct.nested1 = 1 // +checklocksfail -} - -type testCaseMethods struct { - mu sync.Mutex - - // +checklocks:mu - guardedField int -} - -func (t *testCaseMethods) Method() { - // Valid access - t.mu.Lock() - t.guardedField = 1 - t.mu.Unlock() - - // invalid access - t.guardedField = 2 // +checklocksfail -} - -// +checklocks:t.mu -func (t *testCaseMethods) MethodLocked(a, b, c int) { - t.guardedField = 3 -} - -// +checklocksignore -func (t *testCaseMethods) IgnoredMethod() { - // Invalid access but should not fail as the function is annotated - // with "// +checklocksignore" - t.guardedField = 2 -} - -func testMethodCalls() { - var tc2 testCaseMethods - - // Valid use, tc2.Method acquires lock. - tc2.Method() - - // Valid access tc2.mu is held before calling tc2.MethodLocked. - tc2.mu.Lock() - tc2.MethodLocked(1, 2, 3) - tc2.mu.Unlock() - - // Invalid access no locks are being held. - tc2.MethodLocked(4, 5, 6) // +checklocksfail -} - -type noMutex struct { - f int - g int -} - -func (n noMutex) method() { - n.f = 1 - n.f = n.g -} - -func testNoMutex() { - var n noMutex - n.method() -} - -func testMultiple() { - var tc1, tc2, tc3 testCaseMethods - - tc1.mu.Lock() - - // Valid access we are holding tc1's lock. - tc1.guardedField = 1 - - // Invalid access we are not holding tc2 or tc3's lock. - tc2.guardedField = 2 // +checklocksfail - tc3.guardedField = 3 // +checklocksfail - tc1.mu.Unlock() -} - -func testConditionalBranchingLocks() { - var tc2 testCaseMethods - x := rand.Intn(10) - if x%2 == 1 { - tc2.mu.Lock() - } - // This is invalid access as tc2.mu is not held if we never entered - // the if block. - tc2.guardedField = 1 // +checklocksfail - - var tc3 testCaseMethods - if x%2 == 1 { - tc3.mu.Lock() - } else { - tc3.mu.Lock() - } - // This is valid as tc3.mu is held in if and else blocks. - tc3.guardedField = 1 -} - -type testMethodWithParams struct { - mu sync.Mutex - - // +checklocks:mu - guardedField int -} - -type ptrToTestMethodWithParams *testMethodWithParams - -// +checklocks:t.mu -// +checklocks:a.mu -func (t *testMethodWithParams) methodLockedWithParams(a *testMethodWithParams, b *testMethodWithParams) { - t.guardedField = a.guardedField - b.guardedField = a.guardedField // +checklocksfail -} - -// +checklocks:t.mu -// +checklocks:a.mu -// +checklocks:b.mu -func (t *testMethodWithParams) methodLockedWithPtrType(a *testMethodWithParams, b ptrToTestMethodWithParams) { - t.guardedField = a.guardedField - b.guardedField = a.guardedField -} - -// +checklocks:a.mu -func standaloneFunctionWithGuard(a *testMethodWithParams) { - a.guardedField = 1 - a.mu.Unlock() - a.guardedField = 1 // +checklocksfail -} - -type testMethodWithEmbedded struct { - mu sync.Mutex - - // +checklocks:mu - guardedField int - p *testMethodWithParams -} - -// +checklocks:t.mu -func (t *testMethodWithEmbedded) DoLocked() { - var a, b testMethodWithParams - t.guardedField = 1 - a.mu.Lock() - b.mu.Lock() - t.p.methodLockedWithParams(&a, &b) // +checklocksfail - a.mu.Unlock() - b.mu.Unlock() -} - -// UnsupportedLockerExample is a test that verifies that trying to annotate a -// field that is not a sync.Mutex/RWMutex results in a failure. -type UnsupportedLockerExample struct { - mu sync.Locker - - // +checklocks:mu - x int // +checklocksfail -} - -func abc() { - var mu sync.Mutex - a := UnsupportedLockerExample{mu: &mu} - a.x = 1 +// nestedGuardStruct nests oneGuardStruct fields. +type nestedGuardStruct struct { + val oneGuardStruct + ptr *oneGuardStruct } diff --git a/tools/constraintutil/BUILD b/tools/constraintutil/BUILD new file mode 100644 index 000000000..004b708c4 --- /dev/null +++ b/tools/constraintutil/BUILD @@ -0,0 +1,18 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "constraintutil", + srcs = ["constraintutil.go"], + marshal = False, + stateify = False, + visibility = ["//tools:__subpackages__"], +) + +go_test( + name = "constraintutil_test", + size = "small", + srcs = ["constraintutil_test.go"], + library = ":constraintutil", +) diff --git a/tools/constraintutil/constraintutil.go b/tools/constraintutil/constraintutil.go new file mode 100644 index 000000000..fb3fbe5c2 --- /dev/null +++ b/tools/constraintutil/constraintutil.go @@ -0,0 +1,169 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package constraintutil provides utilities for working with Go build +// constraints. +package constraintutil + +import ( + "bufio" + "bytes" + "fmt" + "go/build/constraint" + "io" + "os" + "strings" +) + +// FromReader extracts the build constraint from the Go source or assembly file +// whose contents are read by r. +func FromReader(r io.Reader) (constraint.Expr, error) { + // See go/build.parseFileHeader() for the "official" logic that this is + // derived from. + const ( + slashStar = "/*" + starSlash = "*/" + gobuildPrefix = "//go:build" + ) + s := bufio.NewScanner(r) + var ( + inSlashStar = false // between /* and */ + haveGobuild = false + e constraint.Expr + ) +Lines: + for s.Scan() { + line := bytes.TrimSpace(s.Bytes()) + if !inSlashStar && constraint.IsGoBuild(string(line)) { + if haveGobuild { + return nil, fmt.Errorf("multiple go:build directives") + } + haveGobuild = true + var err error + e, err = constraint.Parse(string(line)) + if err != nil { + return nil, err + } + } + ThisLine: + for len(line) > 0 { + if inSlashStar { + if i := bytes.Index(line, []byte(starSlash)); i >= 0 { + inSlashStar = false + line = bytes.TrimSpace(line[i+len(starSlash):]) + continue ThisLine + } + continue Lines + } + if bytes.HasPrefix(line, []byte("//")) { + continue Lines + } + // Note that if /* appears in the line, but not at the beginning, + // then the line is still non-empty, so skipping this and + // terminating below is correct. + if bytes.HasPrefix(line, []byte(slashStar)) { + inSlashStar = true + line = bytes.TrimSpace(line[len(slashStar):]) + continue ThisLine + } + // A non-empty non-comment line terminates scanning for go:build. + break Lines + } + } + return e, s.Err() +} + +// FromString extracts the build constraint from the Go source or assembly file +// containing the given data. If no build constraint applies to the file, it +// returns nil. +func FromString(str string) (constraint.Expr, error) { + return FromReader(strings.NewReader(str)) +} + +// FromFile extracts the build constraint from the Go source or assembly file +// at the given path. If no build constraint applies to the file, it returns +// nil. +func FromFile(path string) (constraint.Expr, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + return FromReader(f) +} + +// Combine returns a constraint.Expr that evaluates to true iff all expressions +// in es evaluate to true. If es is empty, Combine returns nil. +// +// Preconditions: All constraint.Exprs in es are non-nil. +func Combine(es []constraint.Expr) constraint.Expr { + switch len(es) { + case 0: + return nil + case 1: + return es[0] + default: + a := &constraint.AndExpr{es[0], es[1]} + for i := 2; i < len(es); i++ { + a = &constraint.AndExpr{a, es[i]} + } + return a + } +} + +// CombineFromFiles returns a build constraint expression that evaluates to +// true iff the build constraints from all of the given Go source or assembly +// files evaluate to true. If no build constraints apply to any of the given +// files, it returns nil. +func CombineFromFiles(paths []string) (constraint.Expr, error) { + var es []constraint.Expr + for _, path := range paths { + e, err := FromFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read build constraints from %q: %v", path, err) + } + if e != nil { + es = append(es, e) + } + } + return Combine(es), nil +} + +// Lines returns a string containing build constraint directives for the given +// constraint.Expr, including two trailing newlines, as appropriate for a Go +// source or assembly file. At least a go:build directive will be emitted; if +// the constraint is expressible using +build directives as well, then +build +// directives will also be emitted. +// +// If e is nil, Lines returns the empty string. +func Lines(e constraint.Expr) string { + if e == nil { + return "" + } + + var b strings.Builder + b.WriteString("//go:build ") + b.WriteString(e.String()) + b.WriteByte('\n') + + if pblines, err := constraint.PlusBuildLines(e); err == nil { + for _, line := range pblines { + b.WriteString(line) + b.WriteByte('\n') + } + } + + b.WriteByte('\n') + return b.String() +} diff --git a/tools/constraintutil/constraintutil_test.go b/tools/constraintutil/constraintutil_test.go new file mode 100644 index 000000000..eeabd8dcf --- /dev/null +++ b/tools/constraintutil/constraintutil_test.go @@ -0,0 +1,138 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package constraintutil + +import ( + "go/build/constraint" + "testing" +) + +func TestFileParsing(t *testing.T) { + for _, test := range []struct { + name string + data string + expr string + }{ + { + name: "Empty", + }, + { + name: "NoConstraint", + data: "// copyright header\n\npackage main", + }, + { + name: "ConstraintOnFirstLine", + data: "//go:build amd64\n#include \"textflag.h\"", + expr: "amd64", + }, + { + name: "ConstraintAfterSlashSlashComment", + data: "// copyright header\n\n//go:build linux\n\npackage newlib", + expr: "linux", + }, + { + name: "ConstraintAfterSlashStarComment", + data: "/*\ncopyright header\n*/\n\n//go:build !race\n\npackage oldlib", + expr: "!race", + }, + { + name: "ConstraintInSlashSlashComment", + data: "// blah blah //go:build windows", + }, + { + name: "ConstraintInSlashStarComment", + data: "/*\n//go:build windows\n*/", + }, + { + name: "ConstraintAfterPackageClause", + data: "package oops\n//go:build race", + }, + { + name: "ConstraintAfterCppInclude", + data: "#include \"textflag.h\"\n//go:build arm64", + }, + } { + t.Run(test.name, func(t *testing.T) { + e, err := FromString(test.data) + if err != nil { + t.Fatalf("FromString(%q) failed: %v", test.data, err) + } + if e == nil { + if len(test.expr) != 0 { + t.Errorf("FromString(%q): got no constraint, wanted %q", test.data, test.expr) + } + } else { + got := e.String() + if len(test.expr) == 0 { + t.Errorf("FromString(%q): got %q, wanted no constraint", test.data, got) + } else if got != test.expr { + t.Errorf("FromString(%q): got %q, wanted %q", test.data, got, test.expr) + } + } + }) + } +} + +func TestCombine(t *testing.T) { + for _, test := range []struct { + name string + in []string + out string + }{ + { + name: "0", + }, + { + name: "1", + in: []string{"amd64 || arm64"}, + out: "amd64 || arm64", + }, + { + name: "2", + in: []string{"amd64", "amd64 && linux"}, + out: "amd64 && amd64 && linux", + }, + { + name: "3", + in: []string{"amd64", "amd64 || arm64", "amd64 || riscv64"}, + out: "amd64 && (amd64 || arm64) && (amd64 || riscv64)", + }, + } { + t.Run(test.name, func(t *testing.T) { + inexprs := make([]constraint.Expr, 0, len(test.in)) + for _, estr := range test.in { + line := "//go:build " + estr + e, err := constraint.Parse(line) + if err != nil { + t.Fatalf("constraint.Parse(%q) failed: %v", line, err) + } + inexprs = append(inexprs, e) + } + outexpr := Combine(inexprs) + if outexpr == nil { + if len(test.out) != 0 { + t.Errorf("Combine(%v): got no constraint, wanted %q", test.in, test.out) + } + } else { + got := outexpr.String() + if len(test.out) == 0 { + t.Errorf("Combine(%v): got %q, wanted no constraint", test.in, got) + } else if got != test.out { + t.Errorf("Combine(%v): got %q, wanted %q", test.in, got, test.out) + } + } + }) + } +} diff --git a/tools/defs.bzl b/tools/defs.bzl index 27542a2f5..f4266e1de 100644 --- a/tools/defs.bzl +++ b/tools/defs.bzl @@ -9,7 +9,7 @@ load("//tools/go_stateify:defs.bzl", "go_stateify") load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps") load("//tools/nogo:defs.bzl", "nogo_test") load("//tools/bazeldefs:defs.bzl", _arch_genrule = "arch_genrule", _build_test = "build_test", _bzl_library = "bzl_library", _coreutil = "coreutil", _default_installer = "default_installer", _default_net_util = "default_net_util", _more_shards = "more_shards", _most_shards = "most_shards", _proto_library = "proto_library", _select_arch = "select_arch", _select_system = "select_system", _short_path = "short_path", _version = "version") -load("//tools/bazeldefs:cc.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _gbenchmark = "gbenchmark", _grpcpp = "grpcpp", _gtest = "gtest", _vdso_linker_option = "vdso_linker_option") +load("//tools/bazeldefs:cc.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _gbenchmark = "gbenchmark", _gbenchmark_internal = "gbenchmark_internal", _grpcpp = "grpcpp", _gtest = "gtest", _vdso_linker_option = "vdso_linker_option") load("//tools/bazeldefs:go.bzl", _bazel_worker_proto = "bazel_worker_proto", _gazelle = "gazelle", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_path = "go_path", _go_proto_library = "go_proto_library", _go_rule = "go_rule", _go_test = "go_test", _select_goarch = "select_goarch", _select_goos = "select_goos") load("//tools/bazeldefs:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar") load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms") @@ -37,6 +37,7 @@ cc_library = _cc_library cc_test = _cc_test cc_toolchain = _cc_toolchain gbenchmark = _gbenchmark +gbenchmark_internal = _gbenchmark_internal gtest = _gtest grpcpp = _grpcpp vdso_linker_option = _vdso_linker_option diff --git a/tools/go_fieldenum/BUILD b/tools/go_fieldenum/BUILD new file mode 100644 index 000000000..2bfdaeb2f --- /dev/null +++ b/tools/go_fieldenum/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "bzl_library", "go_binary") + +licenses(["notice"]) + +go_binary( + name = "fieldenum", + srcs = ["main.go"], + visibility = ["//:sandbox"], +) + +bzl_library( + name = "defs_bzl", + srcs = ["defs.bzl"], + visibility = ["//visibility:private"], +) diff --git a/tools/go_fieldenum/defs.bzl b/tools/go_fieldenum/defs.bzl new file mode 100644 index 000000000..0cd2679ca --- /dev/null +++ b/tools/go_fieldenum/defs.bzl @@ -0,0 +1,29 @@ +"""The go_fieldenum target infers Field, Fields, and FieldSet types for each +struct in an input source file marked +fieldenum. +""" + +def _go_fieldenum_impl(ctx): + output = ctx.outputs.out + + args = ["-pkg=%s" % ctx.attr.package, "-out=%s" % output.path] + for src in ctx.attr.srcs: + args += [f.path for f in src.files.to_list()] + + ctx.actions.run( + inputs = ctx.files.srcs, + outputs = [output], + mnemonic = "GoFieldenum", + progress_message = "Generating Go field enumerators %s" % ctx.label, + arguments = args, + executable = ctx.executable._tool, + ) + +go_fieldenum = rule( + implementation = _go_fieldenum_impl, + attrs = { + "srcs": attr.label_list(doc = "input source files", mandatory = True, allow_files = True), + "package": attr.string(doc = "the package for the generated source file", mandatory = True), + "out": attr.output(doc = "output file", mandatory = True), + "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_fieldenum:fieldenum")), + }, +) diff --git a/tools/go_fieldenum/main.go b/tools/go_fieldenum/main.go new file mode 100644 index 000000000..68dfdb3db --- /dev/null +++ b/tools/go_fieldenum/main.go @@ -0,0 +1,310 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary fieldenum emits field bitmasks for all structs in a package marked +// "+fieldenum". +package main + +import ( + "flag" + "fmt" + "go/ast" + "go/parser" + "go/token" + "log" + "os" + "strings" +) + +var ( + outputPkg = flag.String("pkg", "", "output package") + outputFilename = flag.String("out", "-", "output filename") +) + +func main() { + // Parse command line arguments. + flag.Parse() + if len(*outputPkg) == 0 { + log.Fatalf("-pkg must be provided") + } + if len(flag.Args()) == 0 { + log.Fatalf("Input files must be provided") + } + + // Parse input files. + inputFiles := make([]*ast.File, 0, len(flag.Args())) + fset := token.NewFileSet() + for _, filename := range flag.Args() { + f, err := parser.ParseFile(fset, filename, nil, parser.ParseComments) + if err != nil { + log.Fatalf("Failed to parse input file %q: %v", filename, err) + } + inputFiles = append(inputFiles, f) + } + + // Determine which types are marked "+fieldenum" and will consequently have + // code generated. + fieldEnumTypes := make(map[string]fieldEnumTypeInfo) + for _, f := range inputFiles { + for _, decl := range f.Decls { + d, ok := decl.(*ast.GenDecl) + if !ok || d.Tok != token.TYPE || d.Doc == nil || len(d.Specs) == 0 { + continue + } + for _, l := range d.Doc.List { + const fieldenumPrefixWithSpace = "// +fieldenum " + if l.Text == "// +fieldenum" || strings.HasPrefix(l.Text, fieldenumPrefixWithSpace) { + spec := d.Specs[0].(*ast.TypeSpec) + name := spec.Name.Name + prefix := name + if len(l.Text) > len(fieldenumPrefixWithSpace) { + prefix = strings.TrimSpace(l.Text[len(fieldenumPrefixWithSpace):]) + } + st, ok := spec.Type.(*ast.StructType) + if !ok { + log.Fatalf("Type %s is marked +fieldenum, but is not a struct", name) + } + fieldEnumTypes[name] = fieldEnumTypeInfo{ + prefix: prefix, + structType: st, + } + break + } + } + } + } + + // Collect information for each type for which code is being generated. + structInfos := make([]structInfo, 0, len(fieldEnumTypes)) + needSyncAtomic := false + for typeName, typeInfo := range fieldEnumTypes { + var si structInfo + si.name = typeName + si.prefix = typeInfo.prefix + for _, field := range typeInfo.structType.Fields.List { + name := structFieldName(field) + // If the field's type is a type that is also marked +fieldenum, + // include a FieldSet for that type in this one's. The field must + // be a struct by value, since if it's a pointer then that struct + // might also point to or include this one (which would make + // FieldSet inclusion circular). It must also be a type defined in + // this package, since otherwise we don't know whether it's marked + // +fieldenum. Thus, field.Type must be an identifier (rather than + // an ast.StarExpr or SelectorExpr). + if tident, ok := field.Type.(*ast.Ident); ok { + if fieldTypeInfo, ok := fieldEnumTypes[tident.Name]; ok { + fsf := fieldSetField{ + fieldName: name, + typePrefix: fieldTypeInfo.prefix, + } + si.reprByFieldSet = append(si.reprByFieldSet, fsf) + si.allFields = append(si.allFields, fsf) + continue + } + } + si.reprByBit = append(si.reprByBit, name) + si.allFields = append(si.allFields, fieldSetField{ + fieldName: name, + }) + // sync/atomic import will be needed for FieldSet.Load(). + needSyncAtomic = true + } + structInfos = append(structInfos, si) + } + + // Build the output file. + var b strings.Builder + fmt.Fprintf(&b, "// Generated by go_fieldenum.\n\n") + fmt.Fprintf(&b, "package %s\n\n", *outputPkg) + if needSyncAtomic { + fmt.Fprintf(&b, "import \"sync/atomic\"\n\n") + } + for _, si := range structInfos { + si.writeTo(&b) + } + + if *outputFilename == "-" { + // Write output to stdout. + fmt.Printf("%s", b.String()) + } else { + // Write output to file. + f, err := os.OpenFile(*outputFilename, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644) + if err != nil { + log.Fatalf("Failed to open output file %q: %v", *outputFilename, err) + } + if _, err := f.WriteString(b.String()); err != nil { + log.Fatalf("Failed to write output file %q: %v", *outputFilename, err) + } + f.Close() + } +} + +type fieldEnumTypeInfo struct { + prefix string + structType *ast.StructType +} + +// structInfo contains information about the code generated for a given struct. +type structInfo struct { + // name is the name of the represented struct. + name string + + // prefix is the prefix X applied to the name of each generated type and + // constant, referred to as X in the comments below for convenience. + prefix string + + // reprByBit contains the names of fields in X that should be represented + // by a bit in the bit mask XFieldSet.fields, and by a bool in XFields. + reprByBit []string + + // reprByFieldSet contains fields in X whose type is a named struct (e.g. + // Y) that has a corresponding FieldSet type YFieldSet, and which should + // therefore be represented by including a value of type YFieldSet in + // XFieldSet, and a value of type YFields in XFields. + reprByFieldSet []fieldSetField + + // allFields contains all fields in X in order of declaration. Fields in + // reprByBit have fieldSetField.typePrefix == "". + allFields []fieldSetField +} + +type fieldSetField struct { + fieldName string + typePrefix string +} + +func structFieldName(f *ast.Field) string { + if len(f.Names) != 0 { + return f.Names[0].Name + } + // For embedded struct fields, the field name is the unqualified type name. + texpr := f.Type + for { + switch t := texpr.(type) { + case *ast.StarExpr: + texpr = t.X + case *ast.SelectorExpr: + texpr = t.Sel + case *ast.Ident: + return t.Name + default: + panic(fmt.Sprintf("unexpected %T", texpr)) + } + } +} + +// Workaround for Go defect (map membership test isn't usable in an +// expression). +func fetContains(xs map[string]*ast.StructType, x string) bool { + _, ok := xs[x] + return ok +} + +func (si *structInfo) writeTo(b *strings.Builder) { + fmt.Fprintf(b, "// A %sField represents a field in %s.\n", si.prefix, si.name) + fmt.Fprintf(b, "type %sField uint\n\n", si.prefix) + if len(si.reprByBit) != 0 { + fmt.Fprintf(b, "// %sFieldX represents %s field X.\n", si.prefix, si.name) + fmt.Fprintf(b, "const (\n") + fmt.Fprintf(b, "\t%sField%s %sField = iota\n", si.prefix, si.reprByBit[0], si.prefix) + for _, fieldName := range si.reprByBit[1:] { + fmt.Fprintf(b, "\t%sField%s\n", si.prefix, fieldName) + } + fmt.Fprintf(b, ")\n\n") + } + + fmt.Fprintf(b, "// %sFields represents a set of fields in %s in a literal-friendly form.\n", si.prefix, si.name) + fmt.Fprintf(b, "// The zero value of %sFields represents an empty set.\n", si.prefix) + fmt.Fprintf(b, "type %sFields struct {\n", si.prefix) + for _, fieldSetField := range si.allFields { + if fieldSetField.typePrefix == "" { + fmt.Fprintf(b, "\t%s bool\n", fieldSetField.fieldName) + } else { + fmt.Fprintf(b, "\t%s %sFields\n", fieldSetField.fieldName, fieldSetField.typePrefix) + } + } + fmt.Fprintf(b, "}\n\n") + + fmt.Fprintf(b, "// %sFieldSet represents a set of fields in %s in a compact form.\n", si.prefix, si.name) + fmt.Fprintf(b, "// The zero value of %sFieldSet represents an empty set.\n", si.prefix) + fmt.Fprintf(b, "type %sFieldSet struct {\n", si.prefix) + numBitmaskUint32s := (len(si.reprByBit) + 31) / 32 + for _, fieldSetField := range si.reprByFieldSet { + fmt.Fprintf(b, "\t%s %sFieldSet\n", fieldSetField.fieldName, fieldSetField.typePrefix) + } + if len(si.reprByBit) != 0 { + fmt.Fprintf(b, "\tfields [%d]uint32\n", numBitmaskUint32s) + } + fmt.Fprintf(b, "}\n\n") + + if len(si.reprByBit) != 0 { + fmt.Fprintf(b, "// Contains returns true if f is present in the %sFieldSet.\n", si.prefix) + fmt.Fprintf(b, "func (fs %sFieldSet) Contains(f %sField) bool {\n", si.prefix, si.prefix) + if numBitmaskUint32s == 1 { + fmt.Fprintf(b, "\treturn fs.fields[0] & (uint32(1) << uint(f)) != 0\n") + } else { + fmt.Fprintf(b, "\treturn fs.fields[f/32] & (uint32(1) << (f%%32)) != 0\n") + } + fmt.Fprintf(b, "}\n\n") + + fmt.Fprintf(b, "// Add adds f to the %sFieldSet.\n", si.prefix) + fmt.Fprintf(b, "func (fs *%sFieldSet) Add(f %sField) {\n", si.prefix, si.prefix) + if numBitmaskUint32s == 1 { + fmt.Fprintf(b, "\tfs.fields[0] |= uint32(1) << uint(f)\n") + } else { + fmt.Fprintf(b, "\tfs.fields[f/32] |= uint32(1) << (f%%32)\n") + } + fmt.Fprintf(b, "}\n\n") + + fmt.Fprintf(b, "// Remove removes f from the %sFieldSet.\n", si.prefix) + fmt.Fprintf(b, "func (fs *%sFieldSet) Remove(f %sField) {\n", si.prefix, si.prefix) + if numBitmaskUint32s == 1 { + fmt.Fprintf(b, "\tfs.fields[0] &^= uint32(1) << uint(f)\n") + } else { + fmt.Fprintf(b, "\tfs.fields[f/32] &^= uint32(1) << (f%%32)\n") + } + fmt.Fprintf(b, "}\n\n") + } + + fmt.Fprintf(b, "// Load returns a copy of the %sFieldSet.\n", si.prefix) + fmt.Fprintf(b, "// Load is safe to call concurrently with AddFieldsLoadable, but not Add or Remove.\n") + fmt.Fprintf(b, "func (fs *%sFieldSet) Load() (copied %sFieldSet) {\n", si.prefix, si.prefix) + for _, fieldSetField := range si.reprByFieldSet { + fmt.Fprintf(b, "\tcopied.%s = fs.%s.Load()\n", fieldSetField.fieldName, fieldSetField.fieldName) + } + for i := 0; i < numBitmaskUint32s; i++ { + fmt.Fprintf(b, "\tcopied.fields[%d] = atomic.LoadUint32(&fs.fields[%d])\n", i, i) + } + fmt.Fprintf(b, "\treturn\n") + fmt.Fprintf(b, "}\n\n") + + fmt.Fprintf(b, "// AddFieldsLoadable adds the given fields to the %sFieldSet.\n", si.prefix) + fmt.Fprintf(b, "// AddFieldsLoadable is safe to call concurrently with Load, but not other methods (including other calls to AddFieldsLoadable).\n") + fmt.Fprintf(b, "func (fs *%sFieldSet) AddFieldsLoadable(fields %sFields) {\n", si.prefix, si.prefix) + for _, fieldSetField := range si.reprByFieldSet { + fmt.Fprintf(b, "\tfs.%s.AddFieldsLoadable(fields.%s)\n", fieldSetField.fieldName, fieldSetField.fieldName) + } + for _, fieldName := range si.reprByBit { + fieldConstName := fmt.Sprintf("%sField%s", si.prefix, fieldName) + fmt.Fprintf(b, "\tif fields.%s {\n", fieldName) + if numBitmaskUint32s == 1 { + fmt.Fprintf(b, "\t\tatomic.StoreUint32(&fs.fields[0], fs.fields[0] | (uint32(1) << uint(%s)))\n", fieldConstName) + } else { + fmt.Fprintf(b, "\t\tword, bit := %s/32, %s%%32\n", fieldConstName, fieldConstName) + fmt.Fprintf(b, "\t\tatomic.StoreUint32(&fs.fields[word], fs.fields[word] | (uint32(1) << bit))\n") + } + fmt.Fprintf(b, "\t}\n") + } + fmt.Fprintf(b, "}\n\n") +} diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD index 5e0487e93..211e6b3ed 100644 --- a/tools/go_generics/go_merge/BUILD +++ b/tools/go_generics/go_merge/BUILD @@ -7,6 +7,6 @@ go_binary( srcs = ["main.go"], visibility = ["//:sandbox"], deps = [ - "//tools/tags", + "//tools/constraintutil", ], ) diff --git a/tools/go_generics/go_merge/main.go b/tools/go_generics/go_merge/main.go index 801f2354f..81394ddce 100644 --- a/tools/go_generics/go_merge/main.go +++ b/tools/go_generics/go_merge/main.go @@ -25,9 +25,8 @@ import ( "os" "path/filepath" "strconv" - "strings" - "gvisor.dev/gvisor/tools/tags" + "gvisor.dev/gvisor/tools/constraintutil" ) var ( @@ -131,6 +130,12 @@ func main() { } f.Decls = newDecls + // Infer build constraints for the output file. + bcexpr, err := constraintutil.CombineFromFiles(flag.Args()) + if err != nil { + fatalf("Failed to read build constraints: %v\n", err) + } + // Write the output file. var buf bytes.Buffer if err := format.Node(&buf, fset, f); err != nil { @@ -141,9 +146,7 @@ func main() { fatalf("opening output: %v\n", err) } defer outf.Close() - if t := tags.Aggregate(flag.Args()); len(t) > 0 { - fmt.Fprintf(outf, "%s\n\n", strings.Join(t.Lines(), "\n")) - } + outf.WriteString(constraintutil.Lines(bcexpr)) if _, err := outf.Write(buf.Bytes()); err != nil { fatalf("write: %v\n", err) } diff --git a/tools/go_generics/rules_tests/template_test.go b/tools/go_generics/rules_tests/template_test.go index b2a3446ef..6f4d140da 100644 --- a/tools/go_generics/rules_tests/template_test.go +++ b/tools/go_generics/rules_tests/template_test.go @@ -20,14 +20,16 @@ import ( ) func TestMax(t *testing.T) { - var a int = max(10, 20) + var a int + a = max(10, 20) if a != 20 { t.Errorf("Bad result of max, got %v, want %v", a, 20) } } func TestIntConst(t *testing.T) { - var a int = add(10) + var a int + a = add(10) if a != 30 { t.Errorf("Bad result of add, got %v, want %v", a, 30) } diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD index c2747d94c..aaa203115 100644 --- a/tools/go_marshal/gomarshal/BUILD +++ b/tools/go_marshal/gomarshal/BUILD @@ -18,5 +18,5 @@ go_library( visibility = [ "//:sandbox", ], - deps = ["//tools/tags"], + deps = ["//tools/constraintutil"], ) diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 00961c90d..4c23637c0 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -25,7 +25,7 @@ import ( "sort" "strings" - "gvisor.dev/gvisor/tools/tags" + "gvisor.dev/gvisor/tools/constraintutil" ) // List of identifiers we use in generated code that may conflict with a @@ -123,16 +123,18 @@ func (g *Generator) writeHeader() error { var b sourceBuffer b.emit("// Automatically generated marshal implementation. See tools/go_marshal.\n\n") - // Emit build tags. - b.emit("// If there are issues with build tag aggregation, see\n") - b.emit("// tools/go_marshal/gomarshal/generator.go:writeHeader(). The build tags here\n") - b.emit("// come from the input set of files used to generate this file. This input set\n") - b.emit("// is filtered based on pre-defined file suffixes related to build tags, see \n") - b.emit("// tools/defs.bzl:calculate_sets().\n\n") - - if t := tags.Aggregate(g.inputs); len(t) > 0 { - b.emit(strings.Join(t.Lines(), "\n")) - b.emit("\n\n") + bcexpr, err := constraintutil.CombineFromFiles(g.inputs) + if err != nil { + return err + } + if bcexpr != nil { + // Emit build constraints. + b.emit("// If there are issues with build constraint aggregation, see\n") + b.emit("// tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here\n") + b.emit("// come from the input set of files used to generate this file. This input set\n") + b.emit("// is filtered based on pre-defined file suffixes related to build constraints,\n") + b.emit("// see tools/defs.bzl:calculate_sets().\n\n") + b.emit(constraintutil.Lines(bcexpr)) } // Package header. @@ -553,11 +555,12 @@ func (g *Generator) writeTests(ts []*testGenerator) error { b.reset() b.emit("// Automatically generated marshal tests. See tools/go_marshal.\n\n") - // Emit build tags. - if t := tags.Aggregate(g.inputs); len(t) > 0 { - b.emit(strings.Join(t.Lines(), "\n")) - b.emit("\n\n") + // Emit build constraints. + bcexpr, err := constraintutil.CombineFromFiles(g.inputs) + if err != nil { + return err } + b.emit(constraintutil.Lines(bcexpr)) b.emit("package %s\n\n", g.pkg) if err := b.write(g.outputTest); err != nil { diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD index e872560a9..d315be060 100644 --- a/tools/go_marshal/test/BUILD +++ b/tools/go_marshal/test/BUILD @@ -41,10 +41,10 @@ go_test( srcs = ["marshal_test.go"], deps = [ ":test", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/marshal", "//pkg/marshal/primitive", - "//pkg/syserror", "//pkg/usermem", "//tools/go_marshal/analysis", "@com_github_google_go_cmp//cmp:go_default_library", diff --git a/tools/go_marshal/test/marshal_test.go b/tools/go_marshal/test/marshal_test.go index 43bafbf96..dec3e84fd 100644 --- a/tools/go_marshal/test/marshal_test.go +++ b/tools/go_marshal/test/marshal_test.go @@ -27,16 +27,16 @@ import ( "unsafe" "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/tools/go_marshal/analysis" "gvisor.dev/gvisor/tools/go_marshal/test" ) -var simulatedErr error = syserror.EFAULT +var simulatedErr error = linuxerr.EFAULT // mockCopyContext implements marshal.CopyContext. type mockCopyContext struct { diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index 913558b4e..ad66981c7 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -6,7 +6,7 @@ go_binary( name = "stateify", srcs = ["main.go"], visibility = ["//:sandbox"], - deps = ["//tools/tags"], + deps = ["//tools/constraintutil"], ) bzl_library( diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 93022f504..3cf00b5dd 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -28,7 +28,7 @@ import ( "strings" "sync" - "gvisor.dev/gvisor/tools/tags" + "gvisor.dev/gvisor/tools/constraintutil" ) var ( @@ -214,10 +214,13 @@ func main() { // Automated warning. fmt.Fprint(outputFile, "// automatically generated by stateify.\n\n") - // Emit build tags. - if t := tags.Aggregate(flag.Args()); len(t) > 0 { - fmt.Fprintf(outputFile, "%s\n\n", strings.Join(t.Lines(), "\n")) + // Emit build constraints. + bcexpr, err := constraintutil.CombineFromFiles(flag.Args()) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to infer build constraints: %v", err) + os.Exit(1) } + outputFile.WriteString(constraintutil.Lines(bcexpr)) // Emit the package name. _, pkg := filepath.Split(*fullPkg) @@ -359,7 +362,12 @@ func main() { fmt.Fprintf(outputFile, " stateSourceObject.LoadWait(%d, &%s.%s)\n", fields[name], recv, name) } emitSaveValue := func(name, typName string) { - fmt.Fprintf(outputFile, " var %sValue %s = %s.save%s()\n", name, typName, recv, camelCased(name)) + // Emit typName to be more robust against code generation bugs, + // but instead of one line make two lines to silence ST1023 + // finding (i.e. avoid nogo finding: "should omit type $typName + // from declaration; it will be inferred from the right-hand side") + fmt.Fprintf(outputFile, " var %sValue %s\n", name, typName) + fmt.Fprintf(outputFile, " %sValue = %s.save%s()\n", name, recv, camelCased(name)) fmt.Fprintf(outputFile, " stateSinkObject.SaveValue(%d, %sValue)\n", fields[name], name) } emitSave := func(name string) { diff --git a/tools/installers/containerd.sh b/tools/installers/containerd.sh index e598bce89..b8da1fe42 100755 --- a/tools/installers/containerd.sh +++ b/tools/installers/containerd.sh @@ -20,6 +20,9 @@ declare -r CONTAINERD_VERSION=${1:-1.3.0} declare -r CONTAINERD_MAJOR="$(echo ${CONTAINERD_VERSION} | awk -F '.' '{ print $1; }')" declare -r CONTAINERD_MINOR="$(echo ${CONTAINERD_VERSION} | awk -F '.' '{ print $2; }')" +# We're running Go 1.16, but using pre-module containerd and cri-tools. +export GO111MODULE=off + # Default to an older version for crictl for containerd <= 1.2. if [[ "${CONTAINERD_MAJOR}" -eq 1 ]] && [[ "${CONTAINERD_MINOR}" -le 2 ]]; then declare -r CRITOOLS_VERSION=${CRITOOLS_VERSION:-1.13.0} @@ -29,8 +32,8 @@ fi # Helper for Go packages below. install_helper() { - PACKAGE="${1}" - TAG="${2}" + declare -r PACKAGE="${1}" + declare -r TAG="${2}" # Clone the repository. mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \ @@ -71,8 +74,8 @@ done # Install containerd & cri-tools. declare -rx GOPATH=$(mktemp -d --tmpdir gopathXXXXX) -install_helper github.com/containerd/containerd "v${CONTAINERD_VERSION}" "${GOPATH}" -install_helper github.com/kubernetes-sigs/cri-tools "v${CRITOOLS_VERSION}" "${GOPATH}" +install_helper github.com/containerd/containerd "v${CONTAINERD_VERSION}" +install_helper github.com/kubernetes-sigs/cri-tools "v${CRITOOLS_VERSION}" # Configure containerd-shim. declare -r shim_config_path=/etc/containerd/runsc/config.toml diff --git a/tools/nogo/BUILD b/tools/nogo/BUILD index 6c6f604b5..d72821377 100644 --- a/tools/nogo/BUILD +++ b/tools/nogo/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "bzl_library", "go_library", "select_goarch", "select_goos") +load("//tools:defs.bzl", "bzl_library", "go_library", "go_test", "select_goarch", "select_goos") load("//tools/nogo:defs.bzl", "nogo_objdump_tool", "nogo_stdlib", "nogo_target") package(licenses = ["notice"]) @@ -35,8 +35,10 @@ go_library( visibility = ["//:sandbox"], deps = [ "//tools/checkescape", + "//tools/checklinkname", "//tools/checklocks", "//tools/checkunsafe", + "//tools/nogo/objdump", "//tools/worker", "@co_honnef_go_tools//staticcheck:go_default_library", "@co_honnef_go_tools//stylecheck:go_default_library", @@ -68,9 +70,16 @@ go_library( "@org_golang_x_tools//go/analysis/passes/unsafeptr:go_default_library", "@org_golang_x_tools//go/analysis/passes/unusedresult:go_default_library", "@org_golang_x_tools//go/gcexportdata:go_default_library", + "@org_golang_x_tools//go/types/objectpath:go_default_library", ], ) +go_test( + name = "nogo_test", + srcs = ["config_test.go"], + library = ":nogo", +) + bzl_library( name = "defs_bzl", srcs = ["defs.bzl"], diff --git a/tools/nogo/analyzers.go b/tools/nogo/analyzers.go index 2b3c03fec..db8bbdb8a 100644 --- a/tools/nogo/analyzers.go +++ b/tools/nogo/analyzers.go @@ -47,6 +47,7 @@ import ( "honnef.co/go/tools/stylecheck" "gvisor.dev/gvisor/tools/checkescape" + "gvisor.dev/gvisor/tools/checklinkname" "gvisor.dev/gvisor/tools/checklocks" "gvisor.dev/gvisor/tools/checkunsafe" ) @@ -80,6 +81,7 @@ var AllAnalyzers = []*analysis.Analyzer{ unusedresult.Analyzer, checkescape.Analyzer, checkunsafe.Analyzer, + checklinkname.Analyzer, checklocks.Analyzer, } @@ -115,11 +117,11 @@ func register(all []*analysis.Analyzer) { func init() { // Add all staticcheck analyzers. for _, a := range staticcheck.Analyzers { - AllAnalyzers = append(AllAnalyzers, a) + AllAnalyzers = append(AllAnalyzers, a.Analyzer) } // Add all stylecheck analyzers. for _, a := range stylecheck.Analyzers { - AllAnalyzers = append(AllAnalyzers, a) + AllAnalyzers = append(AllAnalyzers, a.Analyzer) } // Register lists. diff --git a/tools/nogo/build.go b/tools/nogo/build.go index d173cff1f..4067bb480 100644 --- a/tools/nogo/build.go +++ b/tools/nogo/build.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package nogo import ( diff --git a/tools/nogo/check/main.go b/tools/nogo/check/main.go index 3a6c3fb08..0e7e92965 100644 --- a/tools/nogo/check/main.go +++ b/tools/nogo/check/main.go @@ -62,7 +62,8 @@ func run([]string) int { // Check & load the configuration. if *packageFile != "" && *stdlibFile != "" { - log.Fatalf("unable to perform stdlib and package analysis; provide only one!") + fmt.Fprintf(os.Stderr, "unable to perform stdlib and package analysis; provide only one!") + return 1 } // Run the configuration. @@ -75,18 +76,21 @@ func run([]string) int { c := loadConfig(*packageFile, new(nogo.PackageConfig)).(*nogo.PackageConfig) findings, factData, err = nogo.CheckPackage(c, nogo.AllAnalyzers, nil) } else { - log.Fatalf("please provide at least one of package or stdlib!") + fmt.Fprintf(os.Stderr, "please provide at least one of package or stdlib!") + return 1 } // Check that analysis was successful. if err != nil { - log.Fatalf("error performing analysis: %v", err) + fmt.Fprintf(os.Stderr, "error performing analysis: %v", err) + return 1 } // Save facts. if *factsOutput != "" { if err := ioutil.WriteFile(*factsOutput, factData, 0644); err != nil { - log.Fatalf("error saving findings to %q: %v", *factsOutput, err) + fmt.Fprintf(os.Stderr, "error saving findings to %q: %v", *factsOutput, err) + return 1 } } @@ -94,10 +98,12 @@ func run([]string) int { if *findingsOutput != "" { w, err := os.OpenFile(*findingsOutput, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { - log.Fatalf("error opening output file %q: %v", *findingsOutput, err) + fmt.Fprintf(os.Stderr, "error opening output file %q: %v", *findingsOutput, err) + return 1 } if err := nogo.WriteFindingsTo(w, findings, false /* json */); err != nil { - log.Fatalf("error writing findings to %q: %v", *findingsOutput, err) + fmt.Fprintf(os.Stderr, "error writing findings to %q: %v", *findingsOutput, err) + return 1 } } else { for _, finding := range findings { diff --git a/tools/nogo/config.go b/tools/nogo/config.go index 6436f9d34..ee2533610 100644 --- a/tools/nogo/config.go +++ b/tools/nogo/config.go @@ -186,16 +186,19 @@ func (a AnalyzerConfig) merge(other AnalyzerConfig) { } } -func (a AnalyzerConfig) shouldReport(groupConfig *Group, fullPos, msg string) bool { +// shouldReport returns whether the finding should be reported or suppressed. +// It returns !ok if there is no configuration sufficient to decide one way or +// another. +func (a AnalyzerConfig) shouldReport(groupConfig *Group, fullPos, msg string) (report, ok bool) { gc, ok := a[groupConfig.Name] if !ok { - return groupConfig.Default + return false, false } // Note that if a section appears for a particular group // for a particular analyzer, then it will now be enabled, // and the group default no longer applies. - return gc.shouldReport(fullPos, msg) + return gc.shouldReport(fullPos, msg), true } // Config is a nogo configuration. @@ -298,7 +301,8 @@ func (c *Config) ShouldReport(finding Finding) bool { } // Suppress via global rule? - if !c.Global.shouldReport(groupConfig, fullPos, finding.Message) { + report, ok := c.Global.shouldReport(groupConfig, fullPos, finding.Message) + if ok && !report { return false } @@ -307,5 +311,9 @@ func (c *Config) ShouldReport(finding Finding) bool { if !ok { return groupConfig.Default } - return ac.shouldReport(groupConfig, fullPos, finding.Message) + report, ok = ac.shouldReport(groupConfig, fullPos, finding.Message) + if !ok { + return groupConfig.Default + } + return report } diff --git a/tools/nogo/config_test.go b/tools/nogo/config_test.go new file mode 100644 index 000000000..685cffbec --- /dev/null +++ b/tools/nogo/config_test.go @@ -0,0 +1,301 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.package nogo +package nogo + +import ( + "go/token" + "testing" +) + +// TestShouldReport validates the suppression behavior of Config.ShouldReport. +func TestShouldReport(t *testing.T) { + config := &Config{ + Groups: []Group{ + { + Name: "default-enabled", + Regex: "^default-enabled/", + Default: true, + }, + { + Name: "default-disabled", + Regex: "^default-disabled/", + Default: false, + }, + { + Name: "default-disabled-omitted-from-global", + Regex: "^default-disabled-omitted-from-global/", + Default: false, + }, + }, + Global: AnalyzerConfig{ + "default-enabled": &ItemConfig{ + Exclude: []string{"excluded.go"}, + Suppress: []string{"suppressed"}, + }, + "default-disabled": &ItemConfig{ + Exclude: []string{"excluded.go"}, + Suppress: []string{"suppressed"}, + }, + // Omitting default-disabled-omitted-from-global here + // has no effect on configuration below. + }, + Analyzers: map[AnalyzerName]AnalyzerConfig{ + "analyzer-suppressions": AnalyzerConfig{ + // Suppress some. + "default-enabled": &ItemConfig{ + Exclude: []string{"limited-exclude.go"}, + Suppress: []string{"limited suppress"}, + }, + // Enable all. + "default-disabled": nil, + }, + "enabled-for-default-disabled": AnalyzerConfig{ + "default-disabled": nil, + "default-disabled-omitted-from-global": nil, + }, + }, + } + + if err := config.Compile(); err != nil { + t.Fatalf("Compile(%+v) = %v, want nil", config, err) + } + + cases := []struct { + name string + finding Finding + want bool + }{ + { + name: "enabled", + finding: Finding{ + Category: "foo", + Position: token.Position{ + Filename: "default-enabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: true, + }, + { + name: "ungrouped", + finding: Finding{ + Category: "foo", + Position: token.Position{ + Filename: "ungrouped/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: true, + }, + { + name: "suppressed", + finding: Finding{ + Category: "foo", + Position: token.Position{ + Filename: "default-enabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message suppressed", + }, + want: false, + }, + { + name: "excluded", + finding: Finding{ + Category: "foo", + Position: token.Position{ + Filename: "default-enabled/excluded.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: false, + }, + { + name: "disabled", + finding: Finding{ + Category: "foo", + Position: token.Position{ + Filename: "default-disabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: false, + }, + { + name: "analyzer suppressed", + finding: Finding{ + Category: "analyzer-suppressions", + Position: token.Position{ + Filename: "default-enabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message limited suppress", + }, + want: false, + }, + { + name: "analyzer suppressed not global", + finding: Finding{ + // Doesn't apply outside of analyzer-suppressions. + Category: "foo", + Position: token.Position{ + Filename: "default-enabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message limited suppress", + }, + want: true, + }, + { + name: "analyzer suppressed grouped", + finding: Finding{ + Category: "analyzer-suppressions", + Position: token.Position{ + // Doesn't apply outside of default-enabled. + Filename: "default-disabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message limited suppress", + }, + want: true, + }, + { + name: "analyzer excluded", + finding: Finding{ + Category: "analyzer-suppressions", + Position: token.Position{ + Filename: "default-enabled/limited-exclude.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: false, + }, + { + name: "analyzer excluded not global", + finding: Finding{ + // Doesn't apply outside of analyzer-suppressions. + Category: "foo", + Position: token.Position{ + Filename: "default-enabled/limited-exclude.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: true, + }, + { + name: "analyzer excluded grouped", + finding: Finding{ + Category: "analyzer-suppressions", + Position: token.Position{ + // Doesn't apply outside of default-enabled. + Filename: "default-disabled/limited-exclude.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: true, + }, + { + name: "disabled-omitted", + finding: Finding{ + Category: "foo", + Position: token.Position{ + Filename: "default-disabled-omitted-from-global/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: false, + }, + { + name: "default enabled applies to customized analyzer", + finding: Finding{ + Category: "enabled-for-default-disabled", + Position: token.Position{ + Filename: "default-enabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: true, + }, + { + name: "default overridden in customized analyzer", + finding: Finding{ + Category: "enabled-for-default-disabled", + Position: token.Position{ + Filename: "default-disabled/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: true, + }, + { + name: "default overridden in customized analyzer even when omitted from global", + finding: Finding{ + Category: "enabled-for-default-disabled", + Position: token.Position{ + Filename: "default-disabled-omitted-from-global/file.go", + Offset: 0, + Line: 1, + Column: 1, + }, + Message: "message", + }, + want: true, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := config.ShouldReport(tc.finding); got != tc.want { + t.Errorf("ShouldReport(%+v) = %v, want %v", tc.finding, got, tc.want) + } + }) + } +} diff --git a/tools/nogo/defs.bzl b/tools/nogo/defs.bzl index ddf5816a6..dc9a8b24e 100644 --- a/tools/nogo/defs.bzl +++ b/tools/nogo/defs.bzl @@ -160,6 +160,11 @@ def _nogo_stdlib_impl(ctx): return [NogoStdlibInfo( facts = facts, raw_findings = raw_findings, + ), DefaultInfo( + # Declare the facts and findings as default outputs. This is not + # strictly required, but ensures that the target still perform analysis + # when built directly rather than just indirectly via a nogo_test. + files = depset([facts, raw_findings]), )] nogo_stdlib = go_rule( @@ -198,6 +203,22 @@ NogoInfo = provider( }, ) +def _select_objfile(files): + """Returns (.a file, .x file, is_archive). + + If no .a file is available, then the first .x file will be returned + instead, and vice versa. If neither are available, then the first provided + file will be returned.""" + a_files = [f for f in files if f.path.endswith(".a")] + x_files = [f for f in files if f.path.endswith(".x")] + if not len(x_files) and not len(a_files): + return (files[0], files[0], False) + if not len(x_files): + x_files = a_files + if not len(a_files): + a_files = x_files + return a_files[0], x_files[0], True + def _nogo_aspect_impl(target, ctx): # If this is a nogo rule itself (and not the shadow of a go_library or # go_binary rule created by such a rule), then we simply return nothing. @@ -232,20 +253,14 @@ def _nogo_aspect_impl(target, ctx): deps = deps + info.deps # Start with all target files and srcs as input. - inputs = target.files.to_list() + srcs + binaries = target.files.to_list() + inputs = binaries + srcs # Generate a shell script that dumps the binary. Annoyingly, this seems # necessary as the context in which a run_shell command runs does not seem # to cleanly allow us redirect stdout to the actual output file. Perhaps # I'm missing something here, but the intermediate script does work. - binaries = target.files.to_list() - objfiles = [f for f in binaries if f.path.endswith(".a")] - if len(objfiles) > 0: - # Prefer the .a files for go_library targets. - target_objfile = objfiles[0] - else: - # Use the raw binary for go_binary and go_test targets. - target_objfile = binaries[0] + target_objfile, target_xfile, has_objfile = _select_objfile(binaries) inputs.append(target_objfile) # Extract the importpath for this package. @@ -274,10 +289,8 @@ def _nogo_aspect_impl(target, ctx): # Configure where to find the binary & fact files. Note that this will # use .x and .a regardless of whether this is a go_binary rule, since # these dependencies must be go_library rules. - x_files = [f.path for f in info.binaries if f.path.endswith(".x")] - if not len(x_files): - x_files = [f.path for f in info.binaries if f.path.endswith(".a")] - import_map[info.importpath] = x_files[0] + _, x_file, _ = _select_objfile(info.binaries) + import_map[info.importpath] = x_file.path fact_map[info.importpath] = info.facts.path # Collect all findings; duplicates are resolved at the end. @@ -287,6 +300,11 @@ def _nogo_aspect_impl(target, ctx): inputs.append(info.facts) inputs += info.binaries + # Add the module itself, for the type sanity check. This applies only to + # the libraries, and not binaries or tests. + if has_objfile: + import_map[importpath] = target_xfile.path + # Add the standard library facts. stdlib_info = ctx.attr._nogo_stdlib[NogoStdlibInfo] stdlib_facts = stdlib_info.facts diff --git a/tools/nogo/filter/main.go b/tools/nogo/filter/main.go index d50336b9b..4a925d03c 100644 --- a/tools/nogo/filter/main.go +++ b/tools/nogo/filter/main.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Binary check is the nogo entrypoint. +// Binary filter is the filters and reports nogo findings. package main import ( diff --git a/tools/nogo/findings.go b/tools/nogo/findings.go index 329a7062e..a73bf1a09 100644 --- a/tools/nogo/findings.go +++ b/tools/nogo/findings.go @@ -109,7 +109,7 @@ func ExtractFindingsFromFile(filename string, asJSON bool) (FindingSet, error) { return ExtractFindingsFrom(r, asJSON) } -// ExtractFindingsFromBytes loads findings from bytes. +// ExtractFindingsFrom loads findings from an io.Reader. func ExtractFindingsFrom(r io.Reader, asJSON bool) (findings FindingSet, err error) { if asJSON { dec := json.NewDecoder(r) diff --git a/tools/nogo/nogo.go b/tools/nogo/nogo.go index acee7c8bc..d95d7652f 100644 --- a/tools/nogo/nogo.go +++ b/tools/nogo/nogo.go @@ -41,9 +41,10 @@ import ( "golang.org/x/tools/go/analysis" "golang.org/x/tools/go/analysis/internal/facts" "golang.org/x/tools/go/gcexportdata" + "golang.org/x/tools/go/types/objectpath" // Special case: flags live here and change overall behavior. - "gvisor.dev/gvisor/tools/checkescape" + "gvisor.dev/gvisor/tools/nogo/objdump" "gvisor.dev/gvisor/tools/worker" ) @@ -216,6 +217,11 @@ func (i *importer) Import(path string) (*types.Package, error) { } } + // Check the cache. + if pkg, ok := i.cache[path]; ok && pkg.Complete() { + return pkg, nil + } + // Actually load the data. realPath, ok := i.ImportMap[path] var ( @@ -327,6 +333,9 @@ func CheckStdlib(config *StdlibConfig, analyzers []*analysis.Analyzer) (allFindi // Closure to check a single package. localStdlibFacts := make(stdlibFacts) localStdlibErrs := make(map[string]error) + stdlibCachedFacts.Lookup([]string{""}, func() worker.Sizer { + return localStdlibFacts + }) var checkOne func(pkg string) error // Recursive. checkOne = func(pkg string) error { // Is this already done? @@ -355,11 +364,11 @@ func CheckStdlib(config *StdlibConfig, analyzers []*analysis.Analyzer) (allFindi } // Provide the input. - oldReader := checkescape.Reader - checkescape.Reader = rc // For analysis. + oldReader := objdump.Reader + objdump.Reader = rc // For analysis. defer func() { rc.Close() - checkescape.Reader = oldReader // Restore. + objdump.Reader = oldReader // Restore. }() // Run the analysis. @@ -406,6 +415,56 @@ func CheckStdlib(config *StdlibConfig, analyzers []*analysis.Analyzer) (allFindi return allFindings, buf.Bytes(), nil } +// sanityCheckScope checks that all object in astTypes map to the correct +// objects in binaryTypes. Note that we don't check whether the sets are the +// same, we only care about the fidelity of objects in astTypes. +// +// When an inconsistency is identified, we record it in the astToBinaryMap. +// This allows us to dynamically replace facts and correct for the issue. The +// total number of mismatches is returned. +func sanityCheckScope(astScope *types.Scope, binaryTypes *types.Package, binaryScope *types.Scope, astToBinary map[types.Object]types.Object) error { + for _, x := range astScope.Names() { + fe := astScope.Lookup(x) + path, err := objectpath.For(fe) + if err != nil { + continue // Not an encoded object. + } + se, err := objectpath.Object(binaryTypes, path) + if err != nil { + continue // May be unused, see below. + } + if fe.Id() != se.Id() { + // These types are incompatible. This means that when + // this objectpath is loading from the binaryTypes (for + // dependencies) it will resolve to a fact for that + // type. We don't actually care about this error since + // we do the rewritten, but may as well alert. + log.Printf("WARNING: Object %s is a victim of go/issues/44195.", fe.Id()) + } + se = binaryScope.Lookup(x) + if se == nil { + // The fact may not be exported in the objectdata, if + // it is package internal. This is fine, as nothing out + // of this package can use these symbols. + continue + } + // Save the translation. + astToBinary[fe] = se + } + for i := 0; i < astScope.NumChildren(); i++ { + if err := sanityCheckScope(astScope.Child(i), binaryTypes, binaryScope, astToBinary); err != nil { + return err + } + } + return nil +} + +// sanityCheckTypes checks that two types are sane. The total number of +// mismatches is returned. +func sanityCheckTypes(astTypes, binaryTypes *types.Package, astToBinary map[types.Object]types.Object) error { + return sanityCheckScope(astTypes.Scope(), binaryTypes, binaryTypes.Scope(), astToBinary) +} + // CheckPackage runs all given analyzers. // // The implementation was adapted from [1], which was in turn adpated from [2]. @@ -450,17 +509,46 @@ func CheckPackage(config *PackageConfig, analyzers []*analysis.Analyzer, importC Scopes: make(map[ast.Node]*types.Scope), Selections: make(map[*ast.SelectorExpr]*types.Selection), } - types, err := typeConfig.Check(config.ImportPath, imp.fset, syntax, typesInfo) + astTypes, err := typeConfig.Check(config.ImportPath, imp.fset, syntax, typesInfo) if err != nil && imp.lastErr != ErrSkip { return nil, nil, fmt.Errorf("error checking types: %w", err) } - // Load all package facts. - facts, err := facts.Decode(types, config.factLoader) + // Load all facts using the astTypes, although it may need reconciling + // later on. See the fact functions below. + astFacts, err := facts.Decode(astTypes, config.factLoader) if err != nil { return nil, nil, fmt.Errorf("error decoding facts: %w", err) } + // Sanity check all types and record metadata to prevent + // https://github.com/golang/go/issues/44195. + // + // This block loads the binary types, whose encoding will be well + // defined and aligned with any downstream consumers. Below in the fact + // functions for the analysis, we serialize types to both the astFacts + // and the binaryFacts if available. The binaryFacts are the final + // encoded facts in order to ensure compatibility. We keep the + // intermediate astTypes in order to allow exporting and importing + // within the local package under analysis. + var ( + astToBinary = make(map[types.Object]types.Object) + binaryFacts *facts.Set + ) + if _, ok := config.ImportMap[config.ImportPath]; ok { + binaryTypes, err := imp.Import(config.ImportPath) + if err != nil { + return nil, nil, fmt.Errorf("error loading self: %w", err) + } + if err := sanityCheckTypes(astTypes, binaryTypes, astToBinary); err != nil { + return nil, nil, fmt.Errorf("error sanity checking types: %w", err) + } + binaryFacts, err = facts.Decode(binaryTypes, config.factLoader) + if err != nil { + return nil, nil, fmt.Errorf("error decoding facts: %w", err) + } + } + // Register fact types and establish dependencies between analyzers. // The visit closure will execute recursively, and populate results // will all required analysis results. @@ -479,15 +567,15 @@ func CheckPackage(config *PackageConfig, analyzers []*analysis.Analyzer, importC } // Run the analysis. - factFilter := make(map[reflect.Type]bool) + localFactsFilter := make(map[reflect.Type]bool) for _, f := range a.FactTypes { - factFilter[reflect.TypeOf(f)] = true + localFactsFilter[reflect.TypeOf(f)] = true } p := &analysis.Pass{ Analyzer: a, Fset: imp.fset, Files: syntax, - Pkg: types, + Pkg: astTypes, TypesInfo: typesInfo, ResultOf: results, // All results. Report: func(d analysis.Diagnostic) { @@ -497,13 +585,29 @@ func CheckPackage(config *PackageConfig, analyzers []*analysis.Analyzer, importC Message: d.Message, }) }, - ImportPackageFact: facts.ImportPackageFact, - ExportPackageFact: facts.ExportPackageFact, - ImportObjectFact: facts.ImportObjectFact, - ExportObjectFact: facts.ExportObjectFact, - AllPackageFacts: func() []analysis.PackageFact { return facts.AllPackageFacts(factFilter) }, - AllObjectFacts: func() []analysis.ObjectFact { return facts.AllObjectFacts(factFilter) }, - TypesSizes: typesSizes, + ImportPackageFact: astFacts.ImportPackageFact, + ExportPackageFact: func(fact analysis.Fact) { + astFacts.ExportPackageFact(fact) + if binaryFacts != nil { + binaryFacts.ExportPackageFact(fact) + } + }, + ImportObjectFact: astFacts.ImportObjectFact, + ExportObjectFact: func(obj types.Object, fact analysis.Fact) { + astFacts.ExportObjectFact(obj, fact) + // Note that if no object is recorded in + // astToBinary and binaryFacts != nil, then the + // object doesn't appear in the exported data. + // It was likely an internal object to the + // package, and there is no meaningful + // downstream consumer of the fact. + if binaryObj, ok := astToBinary[obj]; ok && binaryFacts != nil { + binaryFacts.ExportObjectFact(binaryObj, fact) + } + }, + AllPackageFacts: func() []analysis.PackageFact { return astFacts.AllPackageFacts(localFactsFilter) }, + AllObjectFacts: func() []analysis.ObjectFact { return astFacts.AllObjectFacts(localFactsFilter) }, + TypesSizes: typesSizes, } result, err := a.Run(p) if err != nil { @@ -528,8 +632,14 @@ func CheckPackage(config *PackageConfig, analyzers []*analysis.Analyzer, importC } } - // Return all findings. - return findings, facts.Encode(), nil + // Return all findings. Note that we have a preference to returning the + // binary facts if available, so that downstream consumers of these + // facts will find the export aligns with the internal type details. + // See the block above with the call to sanityCheckTypes. + if binaryFacts != nil { + return findings, binaryFacts.Encode(), nil + } + return findings, astFacts.Encode(), nil } func init() { diff --git a/tools/tags/BUILD b/tools/nogo/objdump/BUILD index 1c02e2c89..da56efdf7 100644 --- a/tools/tags/BUILD +++ b/tools/nogo/objdump/BUILD @@ -3,9 +3,8 @@ load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( - name = "tags", - srcs = ["tags.go"], - marshal = False, - stateify = False, + name = "objdump", + srcs = ["objdump.go"], + nogo = False, visibility = ["//tools:__subpackages__"], ) diff --git a/tools/nogo/objdump/objdump.go b/tools/nogo/objdump/objdump.go new file mode 100644 index 000000000..48484abf3 --- /dev/null +++ b/tools/nogo/objdump/objdump.go @@ -0,0 +1,96 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package objdump is a wrapper around relevant objdump flags. +package objdump + +import ( + "flag" + "fmt" + "io" + "os" + "os/exec" +) + +var ( + // Binary is the binary under analysis. + // + // See Reader, below. + binary = flag.String("binary", "", "binary under analysis") + + // Reader is the input stream. + // + // This may be set instead of Binary. + Reader io.Reader + + // objdumpTool is the tool used to dump a binary. + objdumpTool = flag.String("objdump_tool", "", "tool used to dump a binary") +) + +// LoadRaw reads the raw object output. +func LoadRaw(fn func(r io.Reader) error) error { + var r io.Reader + if *binary != "" { + f, err := os.Open(*binary) + if err != nil { + return err + } + defer f.Close() + r = f + } else if Reader != nil { + r = Reader + } else { + // We have no input stream. + return fmt.Errorf("no binary or reader provided") + } + return fn(r) +} + +// Load reads the objdump output. +func Load(fn func(r io.Reader) error) error { + var ( + args []string + stdin io.Reader + ) + if *binary != "" { + args = append(args, *binary) + } else if Reader != nil { + stdin = Reader + } else { + // We have no input stream or binary. + return fmt.Errorf("no binary or reader provided") + } + + // Construct our command. + cmd := exec.Command(*objdumpTool, args...) + cmd.Stdin = stdin + cmd.Stderr = os.Stderr + out, err := cmd.StdoutPipe() + if err != nil { + return err + } + if err := cmd.Start(); err != nil { + return err + } + + // Call the user hook. + userErr := fn(out) + + // Wait for the dump to finish. + if err := cmd.Wait(); userErr == nil && err != nil { + return err + } + + return userErr +} diff --git a/tools/parsers/version.go b/tools/parsers/version.go index ab9194b9d..c250f4a2a 100644 --- a/tools/parsers/version.go +++ b/tools/parsers/version.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build go1.1 +// +build go1.1 + package main // version is set during linking. diff --git a/tools/show_paths.bzl b/tools/show_paths.bzl new file mode 100644 index 000000000..f0126ac7b --- /dev/null +++ b/tools/show_paths.bzl @@ -0,0 +1,27 @@ +"""Formatter to extract the output files from a target.""" + +def format(target): + provider_map = providers(target) + if not provider_map: + return "" + outputs = dict() + + # Try to resolve in order. + files_to_run = provider_map.get("FilesToRunProvider", None) + default_info = provider_map.get("DefaultInfo", None) + output_group_info = provider_map.get("OutputGroupInfo", None) + if files_to_run and files_to_run.executable: + outputs[files_to_run.executable.path] = True + elif default_info: + for x in default_info.files: + outputs[x.path] = True + elif output_group_info: + for entry in dir(output_group_info): + # Filter out all built-ins and anything that is not a depset. + if entry.startswith("_") or not hasattr(getattr(output_group_info, entry), "to_list"): + continue + for x in getattr(output_group_info, entry).to_list(): + outputs[x.path] = True + + # Return all found files. + return "\n".join(outputs.keys()) diff --git a/tools/tags/tags.go b/tools/tags/tags.go deleted file mode 100644 index f35904e0a..000000000 --- a/tools/tags/tags.go +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package tags is a utility for parsing build tags. -package tags - -import ( - "fmt" - "io/ioutil" - "strings" -) - -// OrSet is a set of tags on a single line. -// -// Note that tags may include ",", and we don't distinguish this case in the -// logic below. Ideally, this constraints can be split into separate top-level -// build tags in order to resolve any issues. -type OrSet []string - -// Line returns the line for this or. -func (or OrSet) Line() string { - return fmt.Sprintf("// +build %s", strings.Join([]string(or), " ")) -} - -// AndSet is the set of all OrSets. -type AndSet []OrSet - -// Lines returns the lines to be printed. -func (and AndSet) Lines() (ls []string) { - for _, or := range and { - ls = append(ls, or.Line()) - } - return -} - -// Join joins this AndSet with another. -func (and AndSet) Join(other AndSet) AndSet { - return append(and, other...) -} - -// Tags returns the unique set of +build tags. -// -// Derived form the runtime's canBuild. -func Tags(file string) (tags AndSet) { - data, err := ioutil.ReadFile(file) - if err != nil { - return nil - } - // Check file contents for // +build lines. - for _, p := range strings.Split(string(data), "\n") { - p = strings.TrimSpace(p) - if p == "" { - continue - } - if !strings.HasPrefix(p, "//") { - break - } - if !strings.Contains(p, "+build") { - continue - } - fields := strings.Fields(p[2:]) - if len(fields) < 1 || fields[0] != "+build" { - continue - } - tags = append(tags, OrSet(fields[1:])) - } - return tags -} - -// Aggregate aggregates all tags from a set of files. -// -// Note that these may be in conflict, in which case the build will fail. -func Aggregate(files []string) (tags AndSet) { - for _, file := range files { - tags = tags.Join(Tags(file)) - } - return tags -} diff --git a/tools/verity/measure_tool.go b/tools/verity/measure_tool.go index 0d314ae70..4a0bc497a 100644 --- a/tools/verity/measure_tool.go +++ b/tools/verity/measure_tool.go @@ -21,12 +21,14 @@ import ( "io/ioutil" "log" "os" + "strings" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" ) var path = flag.String("path", "", "path to the verity file system.") +var rawpath = flag.String("rawpath", "", "path to the raw file system.") const maxDigestSize = 64 @@ -40,6 +42,14 @@ func main() { if *path == "" { log.Fatalf("no path provided") } + if *rawpath == "" { + log.Fatalf("no rawpath provided") + } + // TODO(b/182315468): Optimize the Merkle tree generate process to + // allow only updating certain files/directories. + if err := clearMerkle(*rawpath); err != nil { + log.Fatalf("Failed to clear merkle files in %s: %v", *rawpath, err) + } if err := enableDir(*path); err != nil { log.Fatalf("Failed to enable file system %s: %v", *path, err) } @@ -49,6 +59,26 @@ func main() { } } +func clearMerkle(path string) error { + files, err := ioutil.ReadDir(path) + if err != nil { + return err + } + + for _, file := range files { + if file.IsDir() { + if err := clearMerkle(path + "/" + file.Name()); err != nil { + return err + } + } else if strings.HasPrefix(file.Name(), ".merkle.verity") { + if err := os.Remove(path + "/" + file.Name()); err != nil { + return err + } + } + } + return nil +} + // enableDir enables verity features on all the files and sub-directories within // path. func enableDir(path string) error { diff --git a/website/_config.yml b/website/_config.yml index dc44945bc..5f1cbbdeb 100644 --- a/website/_config.yml +++ b/website/_config.yml @@ -44,3 +44,6 @@ authors: mpratt: name: Michael Pratt email: mpratt@google.com + nybidari: + name: Nayana Bidari + email: nybidari@google.com diff --git a/website/archive.key b/website/archive.key index 1a91698bf..8946884a7 100644 --- a/website/archive.key +++ b/website/archive.key @@ -11,19 +11,19 @@ lzqkT3VSMXieImTASosK5L5Q8rryvgCeI9tQLn9EpYFCtU3LXvVgTreGNEEjMOnL dR7yOU+Fs775stn6ucqmdYarx7CvKUrNAhgEeHMonLe1cjYScF7NfLO1GIrQKJR2 DE0f+uJZ52inOkO8ufh3WVQJSYszuS3HCY7w5oj1aP38k/y9zZdZvVvwAWZaiqBQ iwjVs6Kub76VVZZhRDf4iYs8k1Zh64nXdfQt250d8U5yMPF3wIJ+c1yhxwARAQAB -tCpUaGUgZ1Zpc29yIEF1dGhvcnMgPGd2aXNvci1ib3RAZ29vZ2xlLmNvbT6JAlQE -EwEKAD4WIQRvHfheOnHCSRjnJ9VvxtVU4yvZQwUCXSZ4BgIbAwUJA8JnAAULCQgH -AgYVCgkICwIEFgIDAQIeAQIXgAAKCRBvxtVU4yvZQ5WFD/9VZXMW5I2rKV+2gTHT -CsW74kZVi1VFdAVYiUJZXw2jJNtcg3xdgBcscYPyecyka/6TS2q7q2fOGAzCZkcR -e3lLzkGAngMlZ7PdHAE0PDMNFaeMZW0dxNH68vn7AiA1y2XwENnxVec7iXQH6aX5 -xUNg2OCiv5f6DJItHc/Q4SvFUi8QK7TT/GYE1RJXVJlLqfO6y4V8SeqfM+FHpHZM -gzrwdTgsNiEm4lMjWcgb2Ib4i2JUVAjIRPfcpysiV5E7c3SPXyu4bOovKKlbhiJ1 -Q1M9M0zHik34Kjf4YNO1EW936j7Msd181CJt5Bl9XvlhPb8gey/ygpIvcicLx6M5 -lRJTy4z1TtkmtZ7E8EbJZWoPTaHlA6hoMtGeE35j3vMZN1qZYaYt26eFOxxhh7PA -J0h1lS7T2O8u1c2JKhKvajtdmbqbJgI8FRhVsMoVBnqDK5aE9MOAso36OibfweEL -8iV2z8JnBpWtbbUEaWro4knPtbLJbQFvXVietm3cFsbGg+DMIwI6x6HcU91IEFYI -Sv4orK7xgLuM+f6dxo/Wel3ht18dg3x3krBLALTYBidRfnQYYR3sTfLquB8b5WaY -o829L2Bop9GBygdLevkHHN5It6q8CVpn0H5HEJMNaDOX1LcPbf0CKwkkAVCBd9YZ -eAX38ds9LliK7XPXdC4c+zEkGA== -=x8TG +tCpUaGUgZ1Zpc29yIEF1dGhvcnMgPGd2aXNvci1ib3RAZ29vZ2xlLmNvbT6JAk4E +EwEKADgCGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AWIQRvHfheOnHCSRjnJ9Vv +xtVU4yvZQwUCYO4TxQAKCRBvxtVU4yvZQ9UoEACLPV7CnEA2bjCPi0NCWB/Mo1WL +evqv7Wv7vmXzI1K9DrqOhxuamQW75SVXg1df0hTJWbKFmDAip6NEC2Rg5P+A8hHj +nW/VG+q4ZFT662jDhnXQiO9L7EZzjyqNF4yWYzzgnqEu/SmGkDLDYiUCcGBqS2oE +EQfk7RHJSLMJXAnNDH7OUDgrirSssg/dlQ5uAHA9Au80VvC5fsTKza8b3Aydw3SV +iB8/Yuikbl8wKbpSGiXtR4viElXjNips0+mBqaUk2xpqSBrsfN+FezcInVXaXFeq +xtpq2/3M3DYbqCRjqeyd9wNi92FHdOusNrK4MYe0pAYbGjc65BwH+F0T4oJ8ZSJV +lIt+FZ0MqM1T97XadybYFsJh8qvajQpZEPL+zzNncc4f1d80e7+lwIZV/al0FZWW +Zlp7TpbeO/uW+lHs5W14YKwaQVh1whapKXTrATipNOOSCw2hnfrT8V7Hy55QWaGZ +f4/kfy929EeCP16d/LqOClv0j0RBr6NhRBQ0l/BE/mXjJwIk6nKwi+Yi4ek1ARi6 +AlCMLn9AZF7aTGpvCiftzIrlyDfVZT5IX03TayxRHZ4b1Rj8eyJaHcjI49u83gkr +4LGX08lEawn9nxFSx4RCg2swGiYw5F436wwwAIozqJuDASeTa3QND3au5v0oYWnl +umDySUl5wPaAaALgzA== +=5/8T -----END PGP PUBLIC KEY BLOCK----- diff --git a/website/assets/images/2021-08-31-rack-figure1.png b/website/assets/images/2021-08-31-rack-figure1.png Binary files differnew file mode 100644 index 000000000..6d9fdb147 --- /dev/null +++ b/website/assets/images/2021-08-31-rack-figure1.png diff --git a/website/assets/images/2021-08-31-rack-figure2.png b/website/assets/images/2021-08-31-rack-figure2.png Binary files differnew file mode 100644 index 000000000..c2043ecae --- /dev/null +++ b/website/assets/images/2021-08-31-rack-figure2.png diff --git a/website/assets/images/2021-08-31-rack-figure3.png b/website/assets/images/2021-08-31-rack-figure3.png Binary files differnew file mode 100644 index 000000000..e8b689f33 --- /dev/null +++ b/website/assets/images/2021-08-31-rack-figure3.png diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md index b6cf57a77..938605cc2 100644 --- a/website/blog/2019-11-18-security-basics.md +++ b/website/blog/2019-11-18-security-basics.md @@ -188,11 +188,11 @@ for direct access to some files. And most files will be remotely accessed through the Gofers, in which case no FDs are donated to the Sentry. The Sentry itself is only allowed access to specific -[whitelisted syscalls](https://github.com/google/gvisor/blob/master/runsc/config/config.go). +[allowlisted syscalls](https://github.com/google/gvisor/blob/master/runsc/config/config.go). Without networking, the Sentry needs 53 host syscalls in order to function, and -with networking, it uses an additional 15[^8]. By limiting the whitelist to only +with networking, it uses an additional 15[^8]. By limiting the allowlist to only these needed syscalls, we radically reduce the amount of host OS attack surface. -If any attempts are made to call something outside the whitelist, it is +If any attempts are made to call something outside the allowlist, it is immediately blocked and the sandbox is killed by the Host OS. ### Sentry/Gofer Interface: @@ -281,6 +281,8 @@ other ways the community can contribute to help make gVisor safe, fast and stable. <br> <br> +**Updated (2021-07-14):** this post was updated to use more inclusive language. +<br> -------------------------------------------------------------------------------- diff --git a/website/blog/2021-08-31-gvisor-rack.md b/website/blog/2021-08-31-gvisor-rack.md new file mode 100644 index 000000000..e7d4582e4 --- /dev/null +++ b/website/blog/2021-08-31-gvisor-rack.md @@ -0,0 +1,120 @@ +# gVisor RACK + +gVisor has implemented the [RACK](https://datatracker.ietf.org/doc/html/rfc8985) +(Recent ACKnowledgement) TCP loss-detection algorithm in our network stack, +which improves throughput in the presence of packet loss and reordering. + +TCP is a connection-oriented protocol that detects and recovers from loss by +retransmitting packets. [RACK](https://datatracker.ietf.org/doc/html/rfc8985) is +one of the recent loss-detection methods implemented in Linux and BSD, which +helps in identifying packet loss quickly and accurately in the presence of +packet reordering and tail losses. + +## Background + +The TCP congestion window indicates the number of unacknowledged packets that +can be sent at any time. When packet loss is identified, the congestion window +is reduced depending on the type of loss. The sender will recover from the loss +after all the packets sent before reducing the congestion window are +acknowledged. If the loss is identified falsely by the connection, then the +connection enters loss recovery unnecessarily, resulting in sending fewer +packets. + +Packet loss is identified mainly in two ways: + +1. Three duplicate acknowledgments, which will result in either + [Fast](https://datatracker.ietf.org/doc/html/rfc2001#section-4) or + [SACK](https://datatracker.ietf.org/doc/html/rfc6675) recovery. The + congestion window is reduced depending on the type of congestion control + algorithm. For example, in the + [Reno](https://en.wikipedia.org/wiki/TCP_congestion_control#TCP_Tahoe_and_Reno) + algorithm it is reduced to half. +2. RTO (Retransmission Timeout) which will result in Timeout recovery. The + congestion window is reduced to one + [MSS](https://en.wikipedia.org/wiki/Maximum_segment_size). + +Both of these cases result in reducing the congestion window, with RTO being +more expensive. Most of the existing algorithms do not detect packet reordering, +which get incorrectly identified as packet loss, resulting in an RTO. +Furthermore, the loss of an ACK at the end of a sequence (known as "tail loss") +will also trigger RTO and slow down future transmissions unnecessarily. RACK +helps us to identify loss accurately in all these scenarios, and will avoid +entering RTO. + +## Implementation of RACK + +Implementation of RACK requires support for: + +1. Per-packet transmission timestamps: RACK detects loss depending on the + transmission times of the packet and the timestamp at which ACK was + received. +2. SACK and ability to detect DSACK: Selective Acknowledgement and Duplicate + SACK are used to adjust the timer window after which a packet can be marked + as lost. + +### Packet Reordering + +Packet reordering commonly occurs when different packets take different paths +through a network. The diagram below shows the transmission of four packets +which get reordered in transmission, and the resulting TCP behavior with and +without RACK. + +![Figure 1](/assets/images/2021-08-31-rack-figure1.png "Packet reordering.") + +In the above example, the sender sees three duplicate acknowledgments. Without +RACK, this is identified falsely as packet loss, and the congestion window will +be reduced after entering Fast/SACK recovery. + +To detect packet reordering, RACK uses a reorder window, bounded between +[[RTT](https://en.wikipedia.org/wiki/Round-trip_delay)/4, RTT]. The reorder +timer is set to expire after _RTT+reorder\_window_. A packet is marked as lost +when the packets following it were acknowledged using SACK and the reorder timer +expires. The reorder window is increased when a DSACK is received (which +indicates that there is a higher degree of reordering). + +### Tail Loss + +Tail loss occurs when the packets are lost at the end of data transmission. The +diagram below shows an example of tail loss when the last three packets are +lost, and how it is handled with and without RACK. + +![Figure 2](/assets/images/2021-08-31-rack-figure2.png "Tail loss figure 2.") + +For tail losses, RACK uses a Tail Loss Probe (TLP), which relies on a timer for +the last packet sent. The TLP timer is set to _2 \* RTT,_ after which a probe is +sent. The probe packet will allow the connection one more chance to detect a +loss by triggering ACK feedback to avoid entering RTO. In the above example, the +loss is recovered without entering the RTO. + +TLP will also help in cases where the ACK was lost but all the packets were +received by the receiver. The below diagram shows that the ACK received for the +probe packet avoided the RTO. + +![Figure 3](/assets/images/2021-08-31-rack-figure3.png "Tail loss figure 3.") + +If there was some loss, then the ACK for the probe packet will have the SACK +blocks, which will be used to detect and retransmit the lost packets. + +In gVisor, we have support for +[NewReno](https://datatracker.ietf.org/doc/html/rfc6582) and SACK loss recovery +methods. We +[added support for RACK](https://github.com/google/gvisor/issues/5243) recently, +and it is the default when SACK is enabled. After enabling RACK, our internal +benchmarks in the presence of reordering and tail losses and the data we took +from internal users inside Google have shown ~50% reduction in the number of +RTOs. + +While RACK has improved one aspect of TCP performance by reducing the timeouts +in the presence of reordering and tail losses, in gVisor we plan to implement +the undoing of congestion windows and +[BBRv2](https://datatracker.ietf.org/doc/html/draft-cardwell-iccrg-bbr-congestion-control) +(once there is an RFC available) to further improve TCP performance in less +ideal network conditions. + +If you haven’t already, try gVisor. The instructions to get started are in our +[Quick Start](https://gvisor.dev/docs/user_guide/quick_start/docker/). You can +also get involved with the gVisor community via our +[Gitter channel](https://gitter.im/gvisor/community), +[email list](https://groups.google.com/forum/#!forum/gvisor-users), +[issue tracker](https://gvisor.dev/issue/new), and +[Github repository](https://github.com/google/gvisor). diff --git a/website/blog/BUILD b/website/blog/BUILD index 17beb721f..0384b9ba9 100644 --- a/website/blog/BUILD +++ b/website/blog/BUILD @@ -49,6 +49,16 @@ doc( permalink = "/blog/2020/10/22/platform-portability/", ) +doc( + name = "gvisor-rack", + src = "2021-08-31-gvisor-rack.md", + authors = [ + "nybidari", + ], + layout = "post", + permalink = "/blog/2021/08/31/gvisor-rack/", +) + docs( name = "posts", deps = [ diff --git a/website/cmd/server/main.go b/website/cmd/server/main.go index 707a3a8f8..1e5b56fbb 100644 --- a/website/cmd/server/main.go +++ b/website/cmd/server/main.go @@ -258,7 +258,7 @@ const pprofFixedPrefix = "https://storage.googleapis.com/" // allowedBuckets enforces constraints on the pprof target. // // If the continuous integration system is changed in the future to use -// additional buckets, they may be whitelisted here. See registerProfile. +// additional buckets, they may be allowed here. See registerProfile. var allowedBuckets = map[string]bool{ "gvisor-buildkite": true, } |