diff options
145 files changed, 5606 insertions, 2053 deletions
@@ -61,8 +61,9 @@ Make sure the following dependencies are installed: Build and install the `runsc` binary: ```sh -make runsc -sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin +mkdir -p bin +make copy TARGETS=runsc DESTINATION=bin/ +sudo cp ./bin/runsc /usr/local/bin ``` ### Testing @@ -42,10 +42,10 @@ http_archive( # binaries of symbols, which we don't want. "//tools:rules_go_symbols.patch", ], - sha256 = "69de5c704a05ff37862f7e0f5534d4f479418afc21806c887db544a316f3cb6b", + sha256 = "8e968b5fcea1d2d64071872b12737bbb5514524ee5f0a4f54f5920266c261acb", urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.27.0/rules_go-v0.27.0.tar.gz", - "https://github.com/bazelbuild/rules_go/releases/download/v0.27.0/rules_go-v0.27.0.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.28.0/rules_go-v0.28.0.zip", + "https://github.com/bazelbuild/rules_go/releases/download/v0.28.0/rules_go-v0.28.0.zip", ], ) @@ -69,7 +69,7 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_depe go_rules_dependencies() -go_register_toolchains(go_version = "1.16.2") +go_register_toolchains(go_version = "1.16.8") load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository") @@ -113,11 +113,11 @@ cc_crosstool(name = "crosstool") # Load protobuf dependencies. http_archive( name = "rules_proto", - sha256 = "2a20fd8af3cad3fbab9fd3aec4a137621e0c31f858af213a7ae0f997723fc4a9", - strip_prefix = "rules_proto-a0761ed101b939e19d83b2da5f59034bffc19c12", + sha256 = "66bfdf8782796239d3875d37e7de19b1d94301e8972b3cbd2446b332429b4df1", + strip_prefix = "rules_proto-4.0.0", urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/rules_proto/archive/a0761ed101b939e19d83b2da5f59034bffc19c12.tar.gz", - "https://github.com/bazelbuild/rules_proto/archive/a0761ed101b939e19d83b2da5f59034bffc19c12.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz", + "https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz", ], ) @@ -1256,7 +1256,7 @@ http_archive( strip_prefix = "abseil-cpp-278e0a071885a22dcd2fd1b5576cc44757299343", urls = [ "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/278e0a071885a22dcd2fd1b5576cc44757299343.tar.gz", - "https://github.com/abseil/abseil-cpp/archive/278e0a071885a22dcd2fd1b5576cc44757299343.tar.gz" + "https://github.com/abseil/abseil-cpp/archive/278e0a071885a22dcd2fd1b5576cc44757299343.tar.gz", ], ) @@ -1278,7 +1278,6 @@ load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps") grpc_extra_deps() - http_archive( name = "com_google_googletest", sha256 = "0a10bea96d8670e5eef948d79d824162b1577bb7889539e49ec786bfc3e48912", @@ -1305,7 +1304,9 @@ http_archive( strip_prefix = "protobuf-3.17.3", urls = ["https://github.com/protocolbuffers/protobuf/archive/v3.17.3.zip"], ) + load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") + protobuf_deps() # Schemas for testing. @@ -1635,7 +1636,7 @@ go_repository( importpath = "honnef.co/go/tools", sum = "h1:Tyybiul3hjaq0dkv+kcf5/MPTfo+ZBiEWrkhgxMPH54=", version = "v0.3.0-0.dev.0.20210801021341-453cb28c0b15", - ) +) go_repository( name = "com_github_burntsushi_toml", diff --git a/g3doc/user_guide/compatibility.md b/g3doc/user_guide/compatibility.md index 76e879a01..ef50c0147 100644 --- a/g3doc/user_guide/compatibility.md +++ b/g3doc/user_guide/compatibility.md @@ -42,6 +42,9 @@ Most common utilities work. Note that: * Some tools, such as `tcpdump` and old versions of `ping`, require explicitly enabling raw sockets via the unsafe `--net-raw` runsc flag. + * In case of tcpdump the following invocations will work + * tcpdump -i any + * tcpdump -i \<device-name\> -p (-p disables promiscuous mode) * Different Docker images can behave differently. For example, Alpine Linux and Ubuntu have different `ip` binaries. @@ -82,7 +85,7 @@ Most common utilities work. Note that: | sshd | Partially working. Job control [in progress](https://gvisor.dev/issue/154). | | strace | Working. | | tar | Working. | -| tcpdump | Working. [Promiscuous mode in progress](https://gvisor.dev/issue/3333). | +| tcpdump | Working [only with libpcap versions < 1.10](https://github.com/google/gvisor/issues/6699), [Promiscuous mode in progress](https://gvisor.dev/issue/3333). | | top | Working. | | uptime | Working. | | vim | Working. | diff --git a/images/benchmarks/node/package-lock.json b/images/benchmarks/node/package-lock.json index 580e68aa5..9f59a3a71 100644 --- a/images/benchmarks/node/package-lock.json +++ b/images/benchmarks/node/package-lock.json @@ -1,63 +1,687 @@ { "name": "nodedum", "version": "1.0.0", - "lockfileVersion": 1, + "lockfileVersion": 2, "requires": true, - "dependencies": { - "accepts": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz", - "integrity": "sha1-63d99gEXI6OxTopywIBcjoZ0a9I=", - "requires": { - "mime-types": "~2.1.18", - "negotiator": "0.6.1" + "packages": { + "": { + "name": "nodedum", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "express": "^4.16.4", + "hbs": "^4.0.4", + "redis": "^3.1.2", + "redis-commands": "^1.2.0", + "redis-parser": "^2.6.0", + "secure-random-string": "^1.1.0" } }, - "array-flatten": { + "node_modules/accepts": { + "version": "1.3.7", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz", + "integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==", + "dependencies": { + "mime-types": "~2.1.24", + "negotiator": "0.6.2" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/array-flatten": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=" }, - "async": { + "node_modules/body-parser": { + "version": "1.19.0", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz", + "integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==", + "dependencies": { + "bytes": "3.1.0", + "content-type": "~1.0.4", + "debug": "2.6.9", + "depd": "~1.1.2", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", + "on-finished": "~2.3.0", + "qs": "6.7.0", + "raw-body": "2.4.0", + "type-is": "~1.6.17" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/bytes": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz", + "integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/content-disposition": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz", + "integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==", + "dependencies": { + "safe-buffer": "5.1.2" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/content-type": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", + "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/cookie": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz", + "integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/cookie-signature": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", + "integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw=" + }, + "node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/denque": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/denque/-/denque-1.5.1.tgz", + "integrity": "sha512-XwE+iZ4D6ZUB7mfYRMb5wByE8L74HCn30FBN7sWnXksWc1LO1bPDl67pBR9o/kC4z/xSNAwkMYcGgqDV3BE3Hw==", + "engines": { + "node": ">=0.10" + } + }, + "node_modules/depd": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", + "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak=", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/destroy": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", + "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=" + }, + "node_modules/ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0=" + }, + "node_modules/encodeurl": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", + "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg=" + }, + "node_modules/etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/express": { + "version": "4.17.1", + "resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz", + "integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==", + "dependencies": { + "accepts": "~1.3.7", + "array-flatten": "1.1.1", + "body-parser": "1.19.0", + "content-disposition": "0.5.3", + "content-type": "~1.0.4", + "cookie": "0.4.0", + "cookie-signature": "1.0.6", + "debug": "2.6.9", + "depd": "~1.1.2", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "finalhandler": "~1.1.2", + "fresh": "0.5.2", + "merge-descriptors": "1.0.1", + "methods": "~1.1.2", + "on-finished": "~2.3.0", + "parseurl": "~1.3.3", + "path-to-regexp": "0.1.7", + "proxy-addr": "~2.0.5", + "qs": "6.7.0", + "range-parser": "~1.2.1", + "safe-buffer": "5.1.2", + "send": "0.17.1", + "serve-static": "1.14.1", + "setprototypeof": "1.1.1", + "statuses": "~1.5.0", + "type-is": "~1.6.18", + "utils-merge": "1.0.1", + "vary": "~1.1.2" + }, + "engines": { + "node": ">= 0.10.0" + } + }, + "node_modules/finalhandler": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz", + "integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==", + "dependencies": { + "debug": "2.6.9", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "on-finished": "~2.3.0", + "parseurl": "~1.3.3", + "statuses": "~1.5.0", + "unpipe": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/foreachasync": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/foreachasync/-/foreachasync-3.0.0.tgz", + "integrity": "sha1-VQKYfchxS+M5IJfzLgBxyd7gfPY=" + }, + "node_modules/forwarded": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", + "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/fresh": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", + "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/handlebars": { + "version": "4.7.7", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.7.tgz", + "integrity": "sha512-aAcXm5OAfE/8IXkcZvCepKU3VzW1/39Fb5ZuqMtgI/hT8X2YgoMvBY5dLhq/cpOvw7Lk1nK/UF71aLG/ZnVYRA==", + "dependencies": { + "minimist": "^1.2.5", + "neo-async": "^2.6.0", + "source-map": "^0.6.1", + "uglify-js": "^3.1.4", + "wordwrap": "^1.0.0" + }, + "bin": { + "handlebars": "bin/handlebars" + }, + "engines": { + "node": ">=0.4.7" + }, + "optionalDependencies": { + "uglify-js": "^3.1.4" + } + }, + "node_modules/hbs": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/hbs/-/hbs-4.1.2.tgz", + "integrity": "sha512-WfBnQbozbdiTLjJu6P6Wturgvy0FN8xtRmIjmP0ebX9OGQrt+2S6UC7xX0IebHTCS1sXe20zfTzQ7yhjrEvrfQ==", + "dependencies": { + "handlebars": "4.7.7", + "walk": "2.3.14" + }, + "engines": { + "node": ">= 0.8", + "npm": "1.2.8000 || >= 1.4.16" + } + }, + "node_modules/http-errors": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz", + "integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==", + "dependencies": { + "depd": "~1.1.2", + "inherits": "2.0.3", + "setprototypeof": "1.1.1", + "statuses": ">= 1.5.0 < 2", + "toidentifier": "1.0.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" + }, + "node_modules/ipaddr.js": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", + "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==", + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/media-typer": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/merge-descriptors": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", + "integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E=" + }, + "node_modules/methods": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", + "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4=", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", + "bin": { + "mime": "cli.js" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/mime-db": { + "version": "1.49.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.49.0.tgz", + "integrity": "sha512-CIc8j9URtOVApSFCQIF+VBkX1RwXp/oMMOrqdyXSBXq5RWNEsRfyj1kiRnQgmNXmHxPoFIxOroKA3zcU9P+nAA==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.32", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.32.tgz", + "integrity": "sha512-hJGaVS4G4c9TSMYh2n6SQAGrC4RnfU+daP8G7cSCmaqNjiOoUY0VHCMS42pxnQmVF1GWwFhbHWn3RIxCqTmZ9A==", + "dependencies": { + "mime-db": "1.49.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/minimist": { + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.5.tgz", + "integrity": "sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==" + }, + "node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + }, + "node_modules/negotiator": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", + "integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/neo-async": { "version": "2.6.2", - "resolved": "https://registry.npmjs.org/async/-/async-2.6.2.tgz", - "integrity": "sha512-H1qVYh1MYhEEFLsP97cVKqCGo7KfCyTt6uEWqsTBr9SO84oK9Uwbyd/yCW+6rKJLHksBNUVWZDAjfS+Ccx0Bbg==", + "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", + "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==" + }, + "node_modules/on-finished": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz", + "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=", + "dependencies": { + "ee-first": "1.1.1" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/parseurl": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", + "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/path-to-regexp": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", + "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=" + }, + "node_modules/proxy-addr": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", + "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==", + "dependencies": { + "forwarded": "0.2.0", + "ipaddr.js": "1.9.1" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/qs": { + "version": "6.7.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.7.0.tgz", + "integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ==", + "engines": { + "node": ">=0.6" + } + }, + "node_modules/range-parser": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", + "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/raw-body": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz", + "integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==", + "dependencies": { + "bytes": "3.1.0", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", + "unpipe": "1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/redis": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/redis/-/redis-3.1.2.tgz", + "integrity": "sha512-grn5KoZLr/qrRQVwoSkmzdbw6pwF+/rwODtrOr6vuBRiR/f3rjSTGupbF90Zpqm2oenix8Do6RV7pYEkGwlKkw==", + "dependencies": { + "denque": "^1.5.0", + "redis-commands": "^1.7.0", + "redis-errors": "^1.2.0", + "redis-parser": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-redis" + } + }, + "node_modules/redis-commands": { + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.7.0.tgz", + "integrity": "sha512-nJWqw3bTFy21hX/CPKHth6sfhZbdiHP6bTawSgQBlKOVRG7EZkfHbbHwQJnrE4vsQf0CMNE+3gJ4Fmm16vdVlQ==" + }, + "node_modules/redis-errors": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/redis-errors/-/redis-errors-1.2.0.tgz", + "integrity": "sha1-62LSrbFeTq9GEMBK/hUpOEJQq60=", + "engines": { + "node": ">=4" + } + }, + "node_modules/redis-parser": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz", + "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs=", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/redis/node_modules/redis-parser": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-3.0.0.tgz", + "integrity": "sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ=", + "dependencies": { + "redis-errors": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, + "node_modules/secure-random-string": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/secure-random-string/-/secure-random-string-1.1.3.tgz", + "integrity": "sha512-298HxkJJp5mjpPhxDsN26S/2JmMaUIrQ4PxDI/F4fXKRBTOKendQ5i6JCkc+a8F8koLh0vdfwSCw8+RJkY7N6A==" + }, + "node_modules/send": { + "version": "0.17.1", + "resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz", + "integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==", + "dependencies": { + "debug": "2.6.9", + "depd": "~1.1.2", + "destroy": "~1.0.4", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "~1.7.2", + "mime": "1.6.0", + "ms": "2.1.1", + "on-finished": "~2.3.0", + "range-parser": "~1.2.1", + "statuses": "~1.5.0" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/send/node_modules/ms": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", + "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==" + }, + "node_modules/serve-static": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz", + "integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==", + "dependencies": { + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "parseurl": "~1.3.3", + "send": "0.17.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/setprototypeof": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz", + "integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw==" + }, + "node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/statuses": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", + "integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/toidentifier": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz", + "integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw==", + "engines": { + "node": ">=0.6" + } + }, + "node_modules/type-is": { + "version": "1.6.18", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", + "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", + "dependencies": { + "media-typer": "0.3.0", + "mime-types": "~2.1.24" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/uglify-js": { + "version": "3.14.2", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.14.2.tgz", + "integrity": "sha512-rtPMlmcO4agTUfz10CbgJ1k6UAoXM2gWb3GoMPPZB/+/Ackf8lNWk11K4rYi2D0apgoFRLtQOZhb+/iGNJq26A==", + "optional": true, + "bin": { + "uglifyjs": "bin/uglifyjs" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw=", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/utils-merge": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", + "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=", + "engines": { + "node": ">= 0.4.0" + } + }, + "node_modules/vary": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", + "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw=", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/walk": { + "version": "2.3.14", + "resolved": "https://registry.npmjs.org/walk/-/walk-2.3.14.tgz", + "integrity": "sha512-5skcWAUmySj6hkBdH6B6+3ddMjVQYH5Qy9QGbPmN8kVmLteXk+yVXg+yfk1nbX30EYakahLrr8iPcCxJQSCBeg==", + "dependencies": { + "foreachasync": "^3.0.0" + } + }, + "node_modules/wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha1-J1hIEIkUVqQXHI0CJkQa3pDLyus=" + } + }, + "dependencies": { + "accepts": { + "version": "1.3.7", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz", + "integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==", "requires": { - "lodash": "^4.17.11" + "mime-types": "~2.1.24", + "negotiator": "0.6.2" } }, + "array-flatten": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", + "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI=" + }, "body-parser": { - "version": "1.18.3", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.18.3.tgz", - "integrity": "sha1-WykhmP/dVTs6DyDe0FkrlWlVyLQ=", + "version": "1.19.0", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz", + "integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==", "requires": { - "bytes": "3.0.0", + "bytes": "3.1.0", "content-type": "~1.0.4", "debug": "2.6.9", "depd": "~1.1.2", - "http-errors": "~1.6.3", - "iconv-lite": "0.4.23", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", "on-finished": "~2.3.0", - "qs": "6.5.2", - "raw-body": "2.3.3", - "type-is": "~1.6.16" + "qs": "6.7.0", + "raw-body": "2.4.0", + "type-is": "~1.6.17" } }, "bytes": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz", - "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg=" - }, - "commander": { - "version": "2.20.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.0.tgz", - "integrity": "sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==", - "optional": true + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz", + "integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==" }, "content-disposition": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.2.tgz", - "integrity": "sha1-DPaLud318r55YcOoUXjLhdunjLQ=" + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz", + "integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==", + "requires": { + "safe-buffer": "5.1.2" + } }, "content-type": { "version": "1.0.4", @@ -65,9 +689,9 @@ "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==" }, "cookie": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.3.1.tgz", - "integrity": "sha1-5+Ch+e9DtMi6klxcWpboBtFoc7s=" + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz", + "integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg==" }, "cookie-signature": { "version": "1.0.6", @@ -82,6 +706,11 @@ "ms": "2.0.0" } }, + "denque": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/denque/-/denque-1.5.1.tgz", + "integrity": "sha512-XwE+iZ4D6ZUB7mfYRMb5wByE8L74HCn30FBN7sWnXksWc1LO1bPDl67pBR9o/kC4z/xSNAwkMYcGgqDV3BE3Hw==" + }, "depd": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", @@ -92,11 +721,6 @@ "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=" }, - "double-ended-queue": { - "version": "2.1.0-0", - "resolved": "https://registry.npmjs.org/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz", - "integrity": "sha1-ED01J/0xUo9AGIEwyEHv3XgmTlw=" - }, "ee-first": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", @@ -118,53 +742,53 @@ "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=" }, "express": { - "version": "4.16.4", - "resolved": "https://registry.npmjs.org/express/-/express-4.16.4.tgz", - "integrity": "sha512-j12Uuyb4FMrd/qQAm6uCHAkPtO8FDTRJZBDd5D2KOL2eLaz1yUNdUB/NOIyq0iU4q4cFarsUCrnFDPBcnksuOg==", + "version": "4.17.1", + "resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz", + "integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==", "requires": { - "accepts": "~1.3.5", + "accepts": "~1.3.7", "array-flatten": "1.1.1", - "body-parser": "1.18.3", - "content-disposition": "0.5.2", + "body-parser": "1.19.0", + "content-disposition": "0.5.3", "content-type": "~1.0.4", - "cookie": "0.3.1", + "cookie": "0.4.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "~1.1.2", "encodeurl": "~1.0.2", "escape-html": "~1.0.3", "etag": "~1.8.1", - "finalhandler": "1.1.1", + "finalhandler": "~1.1.2", "fresh": "0.5.2", "merge-descriptors": "1.0.1", "methods": "~1.1.2", "on-finished": "~2.3.0", - "parseurl": "~1.3.2", + "parseurl": "~1.3.3", "path-to-regexp": "0.1.7", - "proxy-addr": "~2.0.4", - "qs": "6.5.2", - "range-parser": "~1.2.0", + "proxy-addr": "~2.0.5", + "qs": "6.7.0", + "range-parser": "~1.2.1", "safe-buffer": "5.1.2", - "send": "0.16.2", - "serve-static": "1.13.2", - "setprototypeof": "1.1.0", - "statuses": "~1.4.0", - "type-is": "~1.6.16", + "send": "0.17.1", + "serve-static": "1.14.1", + "setprototypeof": "1.1.1", + "statuses": "~1.5.0", + "type-is": "~1.6.18", "utils-merge": "1.0.1", "vary": "~1.1.2" } }, "finalhandler": { - "version": "1.1.1", - "resolved": "http://registry.npmjs.org/finalhandler/-/finalhandler-1.1.1.tgz", - "integrity": "sha512-Y1GUDo39ez4aHAw7MysnUD5JzYX+WaIj8I57kO3aEPT1fFRL4sr7mjei97FgnwhAyyzRYmQZaTHb2+9uZ1dPtg==", + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz", + "integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==", "requires": { "debug": "2.6.9", "encodeurl": "~1.0.2", "escape-html": "~1.0.3", "on-finished": "~2.3.0", - "parseurl": "~1.3.2", - "statuses": "~1.4.0", + "parseurl": "~1.3.3", + "statuses": "~1.5.0", "unpipe": "~1.0.0" } }, @@ -174,9 +798,9 @@ "integrity": "sha1-VQKYfchxS+M5IJfzLgBxyd7gfPY=" }, "forwarded": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", - "integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ=" + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", + "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==" }, "fresh": { "version": "0.5.2", @@ -184,40 +808,42 @@ "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=" }, "handlebars": { - "version": "4.0.14", - "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.0.14.tgz", - "integrity": "sha512-E7tDoyAA8ilZIV3xDJgl18sX3M8xB9/fMw8+mfW4msLW8jlX97bAnWgT3pmaNXuvzIEgSBMnAHfuXsB2hdzfow==", + "version": "4.7.7", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.7.tgz", + "integrity": "sha512-aAcXm5OAfE/8IXkcZvCepKU3VzW1/39Fb5ZuqMtgI/hT8X2YgoMvBY5dLhq/cpOvw7Lk1nK/UF71aLG/ZnVYRA==", "requires": { - "async": "^2.5.0", - "optimist": "^0.6.1", + "minimist": "^1.2.5", + "neo-async": "^2.6.0", "source-map": "^0.6.1", - "uglify-js": "^3.1.4" + "uglify-js": "^3.1.4", + "wordwrap": "^1.0.0" } }, "hbs": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/hbs/-/hbs-4.0.4.tgz", - "integrity": "sha512-esVlyV/V59mKkwFai5YmPRSNIWZzhqL5YMN0++ueMxyK1cCfPa5f6JiHtapPKAIVAhQR6rpGxow0troav9WMEg==", + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/hbs/-/hbs-4.1.2.tgz", + "integrity": "sha512-WfBnQbozbdiTLjJu6P6Wturgvy0FN8xtRmIjmP0ebX9OGQrt+2S6UC7xX0IebHTCS1sXe20zfTzQ7yhjrEvrfQ==", "requires": { - "handlebars": "4.0.14", - "walk": "2.3.9" + "handlebars": "4.7.7", + "walk": "2.3.14" } }, "http-errors": { - "version": "1.6.3", - "resolved": "http://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz", - "integrity": "sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0=", + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz", + "integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==", "requires": { "depd": "~1.1.2", "inherits": "2.0.3", - "setprototypeof": "1.1.0", - "statuses": ">= 1.4.0 < 2" + "setprototypeof": "1.1.1", + "statuses": ">= 1.5.0 < 2", + "toidentifier": "1.0.0" } }, "iconv-lite": { - "version": "0.4.23", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.23.tgz", - "integrity": "sha512-neyTUVFtahjf0mB3dZT77u+8O0QB89jFdnBkd5P1JgYPbPaia3gXXOVL2fq8VyU2gMMD7SaN7QukTB/pmXYvDA==", + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", "requires": { "safer-buffer": ">= 2.1.2 < 3" } @@ -228,18 +854,13 @@ "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" }, "ipaddr.js": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.8.0.tgz", - "integrity": "sha1-6qM9bd16zo9/b+DJygRA5wZzix4=" - }, - "lodash": { - "version": "4.17.15", - "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz", - "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==" + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", + "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==" }, "media-typer": { "version": "0.3.0", - "resolved": "http://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=" }, "merge-descriptors": { @@ -253,27 +874,27 @@ "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4=" }, "mime": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz", - "integrity": "sha512-KI1+qOZu5DcW6wayYHSzR/tXKCDC5Om4s1z2QJjDULzLcmf3DvzS7oluY4HCTrc+9FiKmWUgeNLg7W3uIQvxtQ==" + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==" }, "mime-db": { - "version": "1.37.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz", - "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg==" + "version": "1.49.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.49.0.tgz", + "integrity": "sha512-CIc8j9URtOVApSFCQIF+VBkX1RwXp/oMMOrqdyXSBXq5RWNEsRfyj1kiRnQgmNXmHxPoFIxOroKA3zcU9P+nAA==" }, "mime-types": { - "version": "2.1.21", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz", - "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==", + "version": "2.1.32", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.32.tgz", + "integrity": "sha512-hJGaVS4G4c9TSMYh2n6SQAGrC4RnfU+daP8G7cSCmaqNjiOoUY0VHCMS42pxnQmVF1GWwFhbHWn3RIxCqTmZ9A==", "requires": { - "mime-db": "~1.37.0" + "mime-db": "1.49.0" } }, "minimist": { - "version": "0.0.10", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.10.tgz", - "integrity": "sha1-3j+YVD2/lggr5IrRoMfNqDYwHc8=" + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.5.tgz", + "integrity": "sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==" }, "ms": { "version": "2.0.0", @@ -281,9 +902,14 @@ "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" }, "negotiator": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.1.tgz", - "integrity": "sha1-KzJxhOiZIQEXeyhWP7XnECrNDKk=" + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", + "integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==" + }, + "neo-async": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", + "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==" }, "on-finished": { "version": "2.3.0", @@ -293,19 +919,10 @@ "ee-first": "1.1.1" } }, - "optimist": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz", - "integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=", - "requires": { - "minimist": "~0.0.1", - "wordwrap": "~0.0.2" - } - }, "parseurl": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.2.tgz", - "integrity": "sha1-/CidTtiZMRlGDBViUyYs3I3mW/M=" + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", + "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==" }, "path-to-regexp": { "version": "0.1.7", @@ -313,61 +930,65 @@ "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w=" }, "proxy-addr": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.4.tgz", - "integrity": "sha512-5erio2h9jp5CHGwcybmxmVqHmnCBZeewlfJ0pex+UW7Qny7OOZXTtH56TGNyBizkgiOwhJtMKrVzDTeKcySZwA==", + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", + "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==", "requires": { - "forwarded": "~0.1.2", - "ipaddr.js": "1.8.0" + "forwarded": "0.2.0", + "ipaddr.js": "1.9.1" } }, "qs": { - "version": "6.5.2", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz", - "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==" + "version": "6.7.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.7.0.tgz", + "integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ==" }, "range-parser": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.0.tgz", - "integrity": "sha1-9JvmtIeJTdxA3MlKMi9hEJLgDV4=" + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", + "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==" }, "raw-body": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.3.3.tgz", - "integrity": "sha512-9esiElv1BrZoI3rCDuOuKCBRbuApGGaDPQfjSflGxdy4oyzqghxu6klEkkVIvBje+FF0BX9coEv8KqW6X/7njw==", + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz", + "integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==", "requires": { - "bytes": "3.0.0", - "http-errors": "1.6.3", - "iconv-lite": "0.4.23", + "bytes": "3.1.0", + "http-errors": "1.7.2", + "iconv-lite": "0.4.24", "unpipe": "1.0.0" } }, "redis": { - "version": "2.8.0", - "resolved": "https://registry.npmjs.org/redis/-/redis-2.8.0.tgz", - "integrity": "sha512-M1OkonEQwtRmZv4tEWF2VgpG0JWJ8Fv1PhlgT5+B+uNq2cA3Rt1Yt/ryoR+vQNOQcIEgdCdfH0jr3bDpihAw1A==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/redis/-/redis-3.1.2.tgz", + "integrity": "sha512-grn5KoZLr/qrRQVwoSkmzdbw6pwF+/rwODtrOr6vuBRiR/f3rjSTGupbF90Zpqm2oenix8Do6RV7pYEkGwlKkw==", "requires": { - "double-ended-queue": "^2.1.0-0", - "redis-commands": "^1.2.0", - "redis-parser": "^2.6.0" + "denque": "^1.5.0", + "redis-commands": "^1.7.0", + "redis-errors": "^1.2.0", + "redis-parser": "^3.0.0" }, "dependencies": { - "redis-commands": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.4.0.tgz", - "integrity": "sha512-cu8EF+MtkwI4DLIT0x9P8qNTLFhQD4jLfxLR0cCNkeGzs87FN6879JOJwNQR/1zD7aSYNbU0hgsV9zGY71Itvw==" - }, "redis-parser": { - "version": "2.6.0", - "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz", - "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs=" + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-3.0.0.tgz", + "integrity": "sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ=", + "requires": { + "redis-errors": "^1.0.0" + } } } }, "redis-commands": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.5.0.tgz", - "integrity": "sha512-6KxamqpZ468MeQC3bkWmCB1fp56XL64D4Kf0zJSwDZbVLLm7KFkoIcHrgRvQ+sk8dnhySs7+yBg94yIkAK7aJg==" + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.7.0.tgz", + "integrity": "sha512-nJWqw3bTFy21hX/CPKHth6sfhZbdiHP6bTawSgQBlKOVRG7EZkfHbbHwQJnrE4vsQf0CMNE+3gJ4Fmm16vdVlQ==" + }, + "redis-errors": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/redis-errors/-/redis-errors-1.2.0.tgz", + "integrity": "sha1-62LSrbFeTq9GEMBK/hUpOEJQq60=" }, "redis-parser": { "version": "2.6.0", @@ -385,14 +1006,14 @@ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, "secure-random-string": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/secure-random-string/-/secure-random-string-1.1.0.tgz", - "integrity": "sha512-V/h8jqoz58zklNGybVhP++cWrxEPXlLM/6BeJ4e0a8zlb4BsbYRzFs16snrxByPa5LUxCVTD3M6EYIVIHR1fAg==" + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/secure-random-string/-/secure-random-string-1.1.3.tgz", + "integrity": "sha512-298HxkJJp5mjpPhxDsN26S/2JmMaUIrQ4PxDI/F4fXKRBTOKendQ5i6JCkc+a8F8koLh0vdfwSCw8+RJkY7N6A==" }, "send": { - "version": "0.16.2", - "resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz", - "integrity": "sha512-E64YFPUssFHEFBvpbbjr44NCLtI1AohxQ8ZSiJjQLskAdKuriYEP6VyGEsRDH8ScozGpkaX1BGvhanqCwkcEZw==", + "version": "0.17.1", + "resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz", + "integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==", "requires": { "debug": "2.6.9", "depd": "~1.1.2", @@ -401,29 +1022,36 @@ "escape-html": "~1.0.3", "etag": "~1.8.1", "fresh": "0.5.2", - "http-errors": "~1.6.2", - "mime": "1.4.1", - "ms": "2.0.0", + "http-errors": "~1.7.2", + "mime": "1.6.0", + "ms": "2.1.1", "on-finished": "~2.3.0", - "range-parser": "~1.2.0", - "statuses": "~1.4.0" + "range-parser": "~1.2.1", + "statuses": "~1.5.0" + }, + "dependencies": { + "ms": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", + "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==" + } } }, "serve-static": { - "version": "1.13.2", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz", - "integrity": "sha512-p/tdJrO4U387R9oMjb1oj7qSMaMfmOyd4j9hOFoxZe2baQszgHcSWjuya/CiT5kgZZKRudHNOA0pYXOl8rQ5nw==", + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz", + "integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==", "requires": { "encodeurl": "~1.0.2", "escape-html": "~1.0.3", - "parseurl": "~1.3.2", - "send": "0.16.2" + "parseurl": "~1.3.3", + "send": "0.17.1" } }, "setprototypeof": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz", - "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ==" + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz", + "integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw==" }, "source-map": { "version": "0.6.1", @@ -431,28 +1059,29 @@ "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==" }, "statuses": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz", - "integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew==" + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", + "integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=" + }, + "toidentifier": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz", + "integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw==" }, "type-is": { - "version": "1.6.16", - "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.16.tgz", - "integrity": "sha512-HRkVv/5qY2G6I8iab9cI7v1bOIdhm94dVjQCPFElW9W+3GeDOSHmy2EBYe4VTApuzolPcmgFTN3ftVJRKR2J9Q==", + "version": "1.6.18", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", + "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", "requires": { "media-typer": "0.3.0", - "mime-types": "~2.1.18" + "mime-types": "~2.1.24" } }, "uglify-js": { - "version": "3.5.9", - "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.5.9.tgz", - "integrity": "sha512-WpT0RqsDtAWPNJK955DEnb6xjymR8Fn0OlK4TT4pS0ASYsVPqr5ELhgwOwLCP5J5vHeJ4xmMmz3DEgdqC10JeQ==", - "optional": true, - "requires": { - "commander": "~2.20.0", - "source-map": "~0.6.1" - } + "version": "3.14.2", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.14.2.tgz", + "integrity": "sha512-rtPMlmcO4agTUfz10CbgJ1k6UAoXM2gWb3GoMPPZB/+/Ackf8lNWk11K4rYi2D0apgoFRLtQOZhb+/iGNJq26A==", + "optional": true }, "unpipe": { "version": "1.0.0", @@ -470,17 +1099,17 @@ "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw=" }, "walk": { - "version": "2.3.9", - "resolved": "https://registry.npmjs.org/walk/-/walk-2.3.9.tgz", - "integrity": "sha1-MbTbZnjyrgHDnqn7hyWpAx5Vins=", + "version": "2.3.14", + "resolved": "https://registry.npmjs.org/walk/-/walk-2.3.14.tgz", + "integrity": "sha512-5skcWAUmySj6hkBdH6B6+3ddMjVQYH5Qy9QGbPmN8kVmLteXk+yVXg+yfk1nbX30EYakahLrr8iPcCxJQSCBeg==", "requires": { "foreachasync": "^3.0.0" } }, "wordwrap": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz", - "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=" + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha1-J1hIEIkUVqQXHI0CJkQa3pDLyus=" } } } diff --git a/images/benchmarks/node/package.json b/images/benchmarks/node/package.json index 7dcadd523..a46adc22a 100644 --- a/images/benchmarks/node/package.json +++ b/images/benchmarks/node/package.json @@ -11,7 +11,7 @@ "dependencies": { "express": "^4.16.4", "hbs": "^4.0.4", - "redis": "^2.8.0", + "redis": "^3.1.2", "redis-commands": "^1.2.0", "redis-parser": "^2.6.0", "secure-random-string": "^1.1.0" diff --git a/pkg/atomicbitops/atomicbitops_amd64.s b/pkg/atomicbitops/atomicbitops_amd64.s index cbaf716bb..6b9a67adc 100644 --- a/pkg/atomicbitops/atomicbitops_amd64.s +++ b/pkg/atomicbitops/atomicbitops_amd64.s @@ -16,28 +16,28 @@ #include "textflag.h" -TEXT ·AndUint32(SB),$0-12 +TEXT ·AndUint32(SB),NOSPLIT,$0-12 MOVQ addr+0(FP), BX MOVL val+8(FP), AX LOCK ANDL AX, 0(BX) RET -TEXT ·OrUint32(SB),$0-12 +TEXT ·OrUint32(SB),NOSPLIT,$0-12 MOVQ addr+0(FP), BX MOVL val+8(FP), AX LOCK ORL AX, 0(BX) RET -TEXT ·XorUint32(SB),$0-12 +TEXT ·XorUint32(SB),NOSPLIT,$0-12 MOVQ addr+0(FP), BX MOVL val+8(FP), AX LOCK XORL AX, 0(BX) RET -TEXT ·CompareAndSwapUint32(SB),$0-20 +TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-20 MOVQ addr+0(FP), DI MOVL old+8(FP), AX MOVL new+12(FP), DX @@ -46,28 +46,28 @@ TEXT ·CompareAndSwapUint32(SB),$0-20 MOVL AX, ret+16(FP) RET -TEXT ·AndUint64(SB),$0-16 +TEXT ·AndUint64(SB),NOSPLIT,$0-16 MOVQ addr+0(FP), BX MOVQ val+8(FP), AX LOCK ANDQ AX, 0(BX) RET -TEXT ·OrUint64(SB),$0-16 +TEXT ·OrUint64(SB),NOSPLIT,$0-16 MOVQ addr+0(FP), BX MOVQ val+8(FP), AX LOCK ORQ AX, 0(BX) RET -TEXT ·XorUint64(SB),$0-16 +TEXT ·XorUint64(SB),NOSPLIT,$0-16 MOVQ addr+0(FP), BX MOVQ val+8(FP), AX LOCK XORQ AX, 0(BX) RET -TEXT ·CompareAndSwapUint64(SB),$0-32 +TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-32 MOVQ addr+0(FP), DI MOVQ old+8(FP), AX MOVQ new+16(FP), DX diff --git a/pkg/atomicbitops/atomicbitops_arm64.s b/pkg/atomicbitops/atomicbitops_arm64.s index 5c780851b..644a6bca5 100644 --- a/pkg/atomicbitops/atomicbitops_arm64.s +++ b/pkg/atomicbitops/atomicbitops_arm64.s @@ -16,7 +16,7 @@ #include "textflag.h" -TEXT ·AndUint32(SB),$0-12 +TEXT ·AndUint32(SB),NOSPLIT,$0-12 MOVD ptr+0(FP), R0 MOVW val+8(FP), R1 again: @@ -26,7 +26,7 @@ again: CBNZ R3, again RET -TEXT ·OrUint32(SB),$0-12 +TEXT ·OrUint32(SB),NOSPLIT,$0-12 MOVD ptr+0(FP), R0 MOVW val+8(FP), R1 again: @@ -36,7 +36,7 @@ again: CBNZ R3, again RET -TEXT ·XorUint32(SB),$0-12 +TEXT ·XorUint32(SB),NOSPLIT,$0-12 MOVD ptr+0(FP), R0 MOVW val+8(FP), R1 again: @@ -46,7 +46,7 @@ again: CBNZ R3, again RET -TEXT ·CompareAndSwapUint32(SB),$0-20 +TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-20 MOVD addr+0(FP), R0 MOVW old+8(FP), R1 MOVW new+12(FP), R2 @@ -60,7 +60,7 @@ done: MOVW R3, prev+16(FP) RET -TEXT ·AndUint64(SB),$0-16 +TEXT ·AndUint64(SB),NOSPLIT,$0-16 MOVD ptr+0(FP), R0 MOVD val+8(FP), R1 again: @@ -70,7 +70,7 @@ again: CBNZ R3, again RET -TEXT ·OrUint64(SB),$0-16 +TEXT ·OrUint64(SB),NOSPLIT,$0-16 MOVD ptr+0(FP), R0 MOVD val+8(FP), R1 again: @@ -80,7 +80,7 @@ again: CBNZ R3, again RET -TEXT ·XorUint64(SB),$0-16 +TEXT ·XorUint64(SB),NOSPLIT,$0-16 MOVD ptr+0(FP), R0 MOVD val+8(FP), R1 again: @@ -90,7 +90,7 @@ again: CBNZ R3, again RET -TEXT ·CompareAndSwapUint64(SB),$0-32 +TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-32 MOVD addr+0(FP), R0 MOVD old+8(FP), R1 MOVD new+16(FP), R2 diff --git a/pkg/atomicbitops/atomicbitops_noasm.go b/pkg/atomicbitops/atomicbitops_noasm.go index 474c0c815..af6b1362e 100644 --- a/pkg/atomicbitops/atomicbitops_noasm.go +++ b/pkg/atomicbitops/atomicbitops_noasm.go @@ -21,6 +21,7 @@ import ( "sync/atomic" ) +//go:nosplit func AndUint32(addr *uint32, val uint32) { for { o := atomic.LoadUint32(addr) @@ -31,6 +32,7 @@ func AndUint32(addr *uint32, val uint32) { } } +//go:nosplit func OrUint32(addr *uint32, val uint32) { for { o := atomic.LoadUint32(addr) @@ -41,6 +43,7 @@ func OrUint32(addr *uint32, val uint32) { } } +//go:nosplit func XorUint32(addr *uint32, val uint32) { for { o := atomic.LoadUint32(addr) @@ -51,6 +54,7 @@ func XorUint32(addr *uint32, val uint32) { } } +//go:nosplit func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) { for { prev = atomic.LoadUint32(addr) @@ -63,6 +67,7 @@ func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) { } } +//go:nosplit func AndUint64(addr *uint64, val uint64) { for { o := atomic.LoadUint64(addr) @@ -73,6 +78,7 @@ func AndUint64(addr *uint64, val uint64) { } } +//go:nosplit func OrUint64(addr *uint64, val uint64) { for { o := atomic.LoadUint64(addr) @@ -83,6 +89,7 @@ func OrUint64(addr *uint64, val uint64) { } } +//go:nosplit func XorUint64(addr *uint64, val uint64) { for { o := atomic.LoadUint64(addr) @@ -93,6 +100,7 @@ func XorUint64(addr *uint64, val uint64) { } } +//go:nosplit func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) { for { prev = atomic.LoadUint64(addr) diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go index 796efa240..59784eacb 100644 --- a/pkg/buffer/view_test.go +++ b/pkg/buffer/view_test.go @@ -509,6 +509,24 @@ func TestView(t *testing.T) { } } +func TestViewClone(t *testing.T) { + const ( + originalSize = 90 + bytesToDelete = 30 + ) + var v View + v.AppendOwned(bytes.Repeat([]byte{originalSize}, originalSize)) + + clonedV := v.Clone() + v.TrimFront(bytesToDelete) + if got, want := int(v.Size()), originalSize-bytesToDelete; got != want { + t.Errorf("original packet was not changed: size expected = %d, got = %d", want, got) + } + if got := clonedV.Size(); got != originalSize { + t.Errorf("cloned packet should not be modified: expected size = %d, got = %d", originalSize, got) + } +} + func TestViewPullUp(t *testing.T) { for _, tc := range []struct { desc string diff --git a/pkg/eventfd/BUILD b/pkg/eventfd/BUILD new file mode 100644 index 000000000..02407cb99 --- /dev/null +++ b/pkg/eventfd/BUILD @@ -0,0 +1,22 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "eventfd", + srcs = [ + "eventfd.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/hostarch", + "//pkg/tcpip/link/rawfile", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "eventfd_test", + srcs = ["eventfd_test.go"], + library = ":eventfd", +) diff --git a/pkg/eventfd/eventfd.go b/pkg/eventfd/eventfd.go new file mode 100644 index 000000000..acdac01b8 --- /dev/null +++ b/pkg/eventfd/eventfd.go @@ -0,0 +1,115 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd wraps Linux's eventfd(2) syscall. +package eventfd + +import ( + "fmt" + "io" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" +) + +const sizeofUint64 = 8 + +// Eventfd represents a Linux eventfd object. +type Eventfd struct { + fd int +} + +// Create returns an initialized eventfd. +func Create() (Eventfd, error) { + fd, _, err := unix.RawSyscall(unix.SYS_EVENTFD2, 0, 0, 0) + if err != 0 { + return Eventfd{}, fmt.Errorf("failed to create eventfd: %v", error(err)) + } + if err := unix.SetNonblock(int(fd), true); err != nil { + unix.Close(int(fd)) + return Eventfd{}, err + } + return Eventfd{int(fd)}, nil +} + +// Wrap returns an initialized Eventfd using the provided fd. +func Wrap(fd int) Eventfd { + return Eventfd{fd} +} + +// Close closes the eventfd, after which it should not be used. +func (ev Eventfd) Close() error { + return unix.Close(ev.fd) +} + +// Dup copies the eventfd, calling dup(2) on the underlying file descriptor. +func (ev Eventfd) Dup() (Eventfd, error) { + other, err := unix.Dup(ev.fd) + if err != nil { + return Eventfd{}, fmt.Errorf("failed to dup: %v", other) + } + return Eventfd{other}, nil +} + +// Notify alerts other users of the eventfd. Users can receive alerts by +// calling Wait or Read. +func (ev Eventfd) Notify() error { + return ev.Write(1) +} + +// Write writes a specific value to the eventfd. +func (ev Eventfd) Write(val uint64) error { + var buf [sizeofUint64]byte + hostarch.ByteOrder.PutUint64(buf[:], val) + for { + n, err := unix.Write(ev.fd, buf[:]) + if err == unix.EINTR { + continue + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short write to eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + return err + } +} + +// Wait blocks until eventfd is non-zero (i.e. someone calls Notify or Write). +func (ev Eventfd) Wait() error { + _, err := ev.Read() + return err +} + +// Read blocks until eventfd is non-zero (i.e. someone calls Notify or Write) +// and returns the value read. +func (ev Eventfd) Read() (uint64, error) { + var tmp [sizeofUint64]byte + n, err := rawfile.BlockingReadUntranslated(ev.fd, tmp[:]) + if err != 0 { + return 0, err + } + if n == 0 { + return 0, io.EOF + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short read from eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + return hostarch.ByteOrder.Uint64(tmp[:]), nil +} + +// FD returns the underlying file descriptor. Use with care, as this breaks the +// Eventfd abstraction. +func (ev Eventfd) FD() int { + return ev.fd +} diff --git a/pkg/eventfd/eventfd_test.go b/pkg/eventfd/eventfd_test.go new file mode 100644 index 000000000..96998d530 --- /dev/null +++ b/pkg/eventfd/eventfd_test.go @@ -0,0 +1,75 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + "time" +) + +func TestReadWrite(t *testing.T) { + efd, err := Create() + if err != nil { + t.Fatalf("failed to Create(): %v", err) + } + defer efd.Close() + + // Make sure we can read actual values + const want = 343 + if err := efd.Write(want); err != nil { + t.Fatalf("failed to write value: %d", want) + } + + got, err := efd.Read() + if err != nil { + t.Fatalf("failed to read value: %v", err) + } + if got != want { + t.Fatalf("Read(): got %d, but wanted %d", got, want) + } +} + +func TestWait(t *testing.T) { + efd, err := Create() + if err != nil { + t.Fatalf("failed to Create(): %v", err) + } + defer efd.Close() + + // There's no way to test with certainty that Wait() blocks indefinitely, but + // as a best-effort we can wait a bit on it. + errCh := make(chan error) + go func() { + errCh <- efd.Wait() + }() + select { + case err := <-errCh: + t.Fatalf("Wait() returned without a call to Notify(): %v", err) + case <-time.After(500 * time.Millisecond): + } + + // Notify and check that Wait() returned. + if err := efd.Notify(); err != nil { + t.Fatalf("Notify() failed: %v", err) + } + select { + case err := <-errCh: + if err != nil { + t.Fatalf("Read() failed: %v", err) + } + case <-time.After(5 * time.Second): + t.Fatalf("Read() did not return after Notify()") + } +} diff --git a/pkg/ring0/defs.go b/pkg/ring0/defs.go index b6e2012e8..38ce9be1e 100644 --- a/pkg/ring0/defs.go +++ b/pkg/ring0/defs.go @@ -77,6 +77,9 @@ type CPU struct { // calls and exceptions via the Registers function. registers arch.Registers + // floatingPointState holds floating point state. + floatingPointState fpu.State + // hooks are kernel hooks. hooks Hooks } @@ -90,6 +93,15 @@ func (c *CPU) Registers() *arch.Registers { return &c.registers } +// FloatingPointState returns the kernel floating point state. +// +// This is explicitly safe to call during KernelException and KernelSyscall. +// +//go:nosplit +func (c *CPU) FloatingPointState() *fpu.State { + return &c.floatingPointState +} + // SwitchOpts are passed to the Switch function. type SwitchOpts struct { // Registers are the user register state. diff --git a/pkg/ring0/defs_amd64.go b/pkg/ring0/defs_amd64.go index 24f6e4cde..81e90dbf7 100644 --- a/pkg/ring0/defs_amd64.go +++ b/pkg/ring0/defs_amd64.go @@ -116,6 +116,11 @@ type CPUArchState struct { errorType uintptr *kernelEntry + + // Copies of global variables, stored in CPU so that they can be used by + // syscall and exception handlers (in the upper address space). + hasXSAVE bool + hasXSAVEOPT bool } // ErrorCode returns the last error code. diff --git a/pkg/ring0/entry_amd64.go b/pkg/ring0/entry_amd64.go index afd646b0b..13ad4e4df 100644 --- a/pkg/ring0/entry_amd64.go +++ b/pkg/ring0/entry_amd64.go @@ -39,11 +39,6 @@ func sysenter() // assembly to get the ABI0 (i.e., primary) address. func addrOfSysenter() uintptr -// swapgs swaps the current GS value. -// -// This must be called prior to sysret/iret. -func swapgs() - // jumpToKernel jumps to the kernel version of the current RIP. func jumpToKernel() diff --git a/pkg/ring0/entry_amd64.s b/pkg/ring0/entry_amd64.s index 520bd9f57..d2913f190 100644 --- a/pkg/ring0/entry_amd64.s +++ b/pkg/ring0/entry_amd64.s @@ -142,8 +142,103 @@ TEXT ·jumpToUser(SB),NOSPLIT,$0 MOVQ AX, 0(SP) RET +// See kernel_amd64.go. +// +// The 16-byte frame size is for the saved values of MXCSR and the x87 control +// word. +TEXT ·doSwitchToUser(SB),NOSPLIT,$16-48 + // We are passed pointers to heap objects, but do not store them in our + // local frame. + NO_LOCAL_POINTERS + + // MXCSR and the x87 control word are the only floating point state + // that is callee-save and thus we must save. + STMXCSR mxcsr-0(SP) + FSTCW cw-8(SP) + + // Restore application floating point state. + MOVQ cpu+0(FP), SI + MOVQ fpState+16(FP), DI + MOVB ·hasXSAVE(SB), BX + TESTB BX, BX + JZ no_xrstor + // Use xrstor to restore all available fp state. For now, we restore + // everything unconditionally by setting the implicit operand edx:eax + // (the "requested feature bitmap") to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI) + JMP fprestore_done +no_xrstor: + // Fall back to fxrstor if xsave is not available. + FXRSTOR64 0(DI) +fprestore_done: + + // Set application GS. + MOVQ regs+8(FP), R8 + SWAP_GS() + MOVQ PTRACE_GS_BASE(R8), AX + PUSHQ AX + CALL ·writeGS(SB) + POPQ AX + + // Call sysret() or iret(). + MOVQ userCR3+24(FP), CX + MOVQ needIRET+32(FP), R9 + ADDQ $-32, SP + MOVQ SI, 0(SP) // cpu + MOVQ R8, 8(SP) // regs + MOVQ CX, 16(SP) // userCR3 + TESTQ R9, R9 + JNZ do_iret + CALL ·sysret(SB) + JMP done_sysret_or_iret +do_iret: + CALL ·iret(SB) +done_sysret_or_iret: + MOVQ 24(SP), AX // vector + ADDQ $32, SP + MOVQ AX, vector+40(FP) + + // Save application floating point state. + MOVQ fpState+16(FP), DI + MOVB ·hasXSAVE(SB), BX + MOVB ·hasXSAVEOPT(SB), CX + TESTB BX, BX + JZ no_xsave + // Use xsave/xsaveopt to save all extended state. + // We save everything unconditionally by setting RFBM to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + TESTB CX, CX + JZ no_xsaveopt + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) + JMP fpsave_done +no_xsaveopt: + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) + JMP fpsave_done +no_xsave: + FXSAVE64 0(DI) +fpsave_done: + + // Restore MXCSR and the x87 control word after one of the two floating + // point save cases above, to ensure the application versions are saved + // before being clobbered here. + LDMXCSR mxcsr-0(SP) + + // FLDCW is a "waiting" x87 instruction, meaning it checks for pending + // unmasked exceptions before executing. Thus if userspace has unmasked + // an exception and has one pending, it can be raised by FLDCW even + // though the new control word will mask exceptions. To prevent this, + // we must first clear pending exceptions (which will be restored by + // XRSTOR, et al). + BYTE $0xDB; BYTE $0xE2; // FNCLEX + FLDCW cw-8(SP) + + RET + // See entry_amd64.go. -TEXT ·sysret(SB),NOSPLIT,$0-24 +TEXT ·sysret(SB),NOSPLIT,$0-32 // Set application FS. We can't do this in Go because Go code needs FS. MOVQ regs+8(FP), AX MOVQ PTRACE_FS_BASE(AX), AX @@ -182,9 +277,11 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 POPQ AX // Restore AX. POPQ SP // Restore SP. SYSRET64() + // sysenter or exception will write our return value and return to our + // caller. // See entry_amd64.go. -TEXT ·iret(SB),NOSPLIT,$0-24 +TEXT ·iret(SB),NOSPLIT,$0-32 // Set application FS. We can't do this in Go because Go code needs FS. MOVQ regs+8(FP), AX MOVQ PTRACE_FS_BASE(AX), AX @@ -220,6 +317,8 @@ TEXT ·iret(SB),NOSPLIT,$0-24 WRITE_CR3() // Switch to userCR3. POPQ AX // Restore AX. IRET() + // sysenter or exception will write our return value and return to our + // caller. // See entry_amd64.go. TEXT ·resume(SB),NOSPLIT,$0 @@ -324,11 +423,39 @@ kernel: MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. + // Save floating point state. CPU.floatingPointState is a slice, so the + // first word of CPU.floatingPointState is a pointer to the destination + // array. + MOVQ CPU_FPU_STATE(AX), DI + MOVB CPU_HAS_XSAVE(AX), BX + MOVB CPU_HAS_XSAVEOPT(AX), CX + TESTB BX, BX + JZ no_xsave + // Use xsave/xsaveopt to save all extended state. + // We save everything unconditionally by setting RFBM to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + TESTB CX, CX + JZ no_xsaveopt + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) + JMP fpsave_done +no_xsaveopt: + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) + JMP fpsave_done +no_xsave: + FXSAVE64 0(DI) +fpsave_done: + // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) - PUSHQ AX // First argument (vCPU). - CALL ·kernelSyscall(SB) // Call the trampoline. - POPQ AX // Pop vCPU. + MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU. + PUSHQ AX // First argument (vCPU). + CALL ·kernelSyscall(SB) // Call the trampoline. + POPQ AX // Pop vCPU. + + // We only trigger a bluepill entry in the bluepill function, and can + // therefore be guaranteed that there is no floating point state to be + // loaded on resuming from halt. JMP ·resume(SB) ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB)); @@ -416,15 +543,43 @@ kernel: MOVQ 8(SP), BX // Load the error code. MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU. MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. - MOVQ 0(SP), BX // BX contains the vector. + + // Save floating point state. CPU.floatingPointState is a slice, so the + // first word of CPU.floatingPointState is a pointer to the destination + // array. + MOVQ CPU_FPU_STATE(AX), DI + MOVB CPU_HAS_XSAVE(AX), BX + MOVB CPU_HAS_XSAVEOPT(AX), CX + TESTB BX, BX + JZ no_xsave + // Use xsave/xsaveopt to save all extended state. + // We save everything unconditionally by setting RFBM to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + TESTB CX, CX + JZ no_xsaveopt + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) + JMP fpsave_done +no_xsaveopt: + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) + JMP fpsave_done +no_xsave: + FXSAVE64 0(DI) +fpsave_done: // Call the exception trampoline. + MOVQ 0(SP), BX // BX contains the vector. LOAD_KERNEL_STACK(GS) - PUSHQ BX // Second argument (vector). - PUSHQ AX // First argument (vCPU). - CALL ·kernelException(SB) // Call the trampoline. - POPQ BX // Pop vector. - POPQ AX // Pop vCPU. + MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU. + PUSHQ BX // Second argument (vector). + PUSHQ AX // First argument (vCPU). + CALL ·kernelException(SB) // Call the trampoline. + POPQ BX // Pop vector. + POPQ AX // Pop vCPU. + + // We only trigger a bluepill entry in the bluepill function, and can + // therefore be guaranteed that there is no floating point state to be + // loaded on resuming from halt. JMP ·resume(SB) #define EXCEPTION_WITH_ERROR(value, symbol, addr) \ diff --git a/pkg/ring0/kernel.go b/pkg/ring0/kernel.go index 292f9d0cc..e7dd84929 100644 --- a/pkg/ring0/kernel.go +++ b/pkg/ring0/kernel.go @@ -14,6 +14,10 @@ package ring0 +import ( + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" +) + // Init initializes a new kernel. // //go:nosplit @@ -80,6 +84,7 @@ func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) { c.self = c // Set self reference. c.kernel = k // Set kernel reference. c.init(cpuID) // Perform architectural init. + c.floatingPointState = fpu.NewState() // Require hooks. if hooks != nil { diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go index 4a4c0ae26..7e55011b5 100644 --- a/pkg/ring0/kernel_amd64.go +++ b/pkg/ring0/kernel_amd64.go @@ -143,6 +143,9 @@ func (c *CPU) init(cpuID int) { // Set mandatory flags. c.registers.Eflags = KernelFlagsSet + + c.hasXSAVE = hasXSAVE + c.hasXSAVEOPT = hasXSAVEOPT } // StackTop returns the kernel's stack address. @@ -248,19 +251,21 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { regs.Ss = uint64(Udata) // Ditto. // Perform the switch. - swapgs() // GS will be swapped on return. - WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. - LoadFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy in floating point. + needIRET := uint64(0) if switchOpts.FullRestore { - vector = iret(c, regs, uintptr(userCR3)) - } else { - vector = sysret(c, regs, uintptr(userCR3)) + needIRET = 1 } - SaveFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy out floating point. - RestoreKernelFPState() // escapes: no. Restore kernel MXCSR. + vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no. return } +func doSwitchToUser( + cpu *CPU, // +0(FP) + regs *arch.Registers, // +8(FP) + fpState *byte, // +16(FP) + userCR3 uint64, // +24(FP) + needIRET uint64) Vector // +32(FP), +40(FP) + var ( sentryXCR0 uintptr sentryXCR0Once sync.Once @@ -287,7 +292,7 @@ func initSentryXCR0() { //go:nosplit func startGo(c *CPU) { // Save per-cpu. - WriteGS(kernelAddr(c.kernelEntry)) + writeGS(kernelAddr(c.kernelEntry)) // // TODO(mpratt): Note that per the note above, this should be done diff --git a/pkg/ring0/lib_amd64.go b/pkg/ring0/lib_amd64.go index 05c394ff5..c42a5b205 100644 --- a/pkg/ring0/lib_amd64.go +++ b/pkg/ring0/lib_amd64.go @@ -21,29 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" ) -// LoadFloatingPoint loads floating point state by the most efficient mechanism -// available (set by Init). -var LoadFloatingPoint func(*byte) - -// SaveFloatingPoint saves floating point state by the most efficient mechanism -// available (set by Init). -var SaveFloatingPoint func(*byte) - -// fxrstor uses fxrstor64 to load floating point state. -func fxrstor(*byte) - -// xrstor uses xrstor to load floating point state. -func xrstor(*byte) - -// fxsave uses fxsave64 to save floating point state. -func fxsave(*byte) - -// xsave uses xsave to save floating point state. -func xsave(*byte) - -// xsaveopt uses xsaveopt to save floating point state. -func xsaveopt(*byte) - // writeFS sets the FS base address (selects one of wrfsbase or wrfsmsr). func writeFS(addr uintptr) @@ -53,8 +30,8 @@ func wrfsbase(addr uintptr) // wrfsmsr writes to the GS_BASE MSR. func wrfsmsr(addr uintptr) -// WriteGS sets the GS address (set by init). -var WriteGS func(addr uintptr) +// writeGS sets the GS address (selects one of wrgsbase or wrgsmsr). +func writeGS(addr uintptr) // wrgsbase writes to the GS base address. func wrgsbase(addr uintptr) @@ -106,19 +83,4 @@ func Init(featureSet *cpuid.FeatureSet) { hasXSAVE = featureSet.UseXsave() hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase) validXCR0Mask = uintptr(featureSet.ValidXCR0Mask()) - if hasXSAVEOPT { - SaveFloatingPoint = xsaveopt - LoadFloatingPoint = xrstor - } else if hasXSAVE { - SaveFloatingPoint = xsave - LoadFloatingPoint = xrstor - } else { - SaveFloatingPoint = fxsave - LoadFloatingPoint = fxrstor - } - if hasFSGSBASE { - WriteGS = wrgsbase - } else { - WriteGS = wrgsmsr - } } diff --git a/pkg/ring0/lib_amd64.s b/pkg/ring0/lib_amd64.s index 8ed98fc84..0f283aaae 100644 --- a/pkg/ring0/lib_amd64.s +++ b/pkg/ring0/lib_amd64.s @@ -128,6 +128,29 @@ TEXT ·wrfsmsr(SB),NOSPLIT,$0-8 BYTE $0x0f; BYTE $0x30; RET +// writeGS writes to the GS base. +// +// This is written in assembly because it must be callable from assembly (ABI0) +// without an intermediate transition to ABIInternal. +// +// Preconditions: must be running in the lower address space, as it accesses +// global data. +TEXT ·writeGS(SB),NOSPLIT,$8-8 + MOVQ addr+0(FP), AX + + CMPB ·hasFSGSBASE(SB), $1 + JNE msr + + PUSHQ AX + CALL ·wrgsbase(SB) + POPQ AX + RET +msr: + PUSHQ AX + CALL ·wrgsmsr(SB) + POPQ AX + RET + // wrgsbase writes to the GS base. // // The code corresponds to: diff --git a/pkg/ring0/offsets_amd64.go b/pkg/ring0/offsets_amd64.go index 75f6218b3..38fe27c35 100644 --- a/pkg/ring0/offsets_amd64.go +++ b/pkg/ring0/offsets_amd64.go @@ -35,6 +35,9 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_HAS_XSAVE 0x%02x\n", reflect.ValueOf(&c.hasXSAVE).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_HAS_XSAVEOPT 0x%02x\n", reflect.ValueOf(&c.hasXSAVEOPT).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_FPU_STATE 0x%02x\n", reflect.ValueOf(&c.floatingPointState).Pointer()-reflect.ValueOf(c).Pointer()) e := &kernelEntry{} fmt.Fprintf(w, "\n// CPU entry offsets.\n") diff --git a/pkg/safecopy/BUILD b/pkg/safecopy/BUILD index 0a045fc8e..2a1602e2b 100644 --- a/pkg/safecopy/BUILD +++ b/pkg/safecopy/BUILD @@ -18,9 +18,9 @@ go_library( ], visibility = ["//:sandbox"], deps = [ - "//pkg/abi/linux", "//pkg/errors", "//pkg/errors/linuxerr", + "//pkg/sighandling", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/safecopy/safecopy.go b/pkg/safecopy/safecopy.go index a9711e63d..0dd0aea83 100644 --- a/pkg/safecopy/safecopy.go +++ b/pkg/safecopy/safecopy.go @@ -23,6 +23,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sighandling" ) // SegvError is returned when a safecopy function receives SIGSEGV. @@ -132,10 +133,10 @@ func initializeAddresses() { func init() { initializeAddresses() - if err := ReplaceSignalHandler(unix.SIGSEGV, addrOfSignalHandler(), &savedSigSegVHandler); err != nil { + if err := sighandling.ReplaceSignalHandler(unix.SIGSEGV, addrOfSignalHandler(), &savedSigSegVHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err)) } - if err := ReplaceSignalHandler(unix.SIGBUS, addrOfSignalHandler(), &savedSigBusHandler); err != nil { + if err := sighandling.ReplaceSignalHandler(unix.SIGBUS, addrOfSignalHandler(), &savedSigBusHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err)) } linuxerr.AddErrorUnwrapper(func(e error) (*errors.Error, bool) { diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go index 2365b2c0d..15f84abea 100644 --- a/pkg/safecopy/safecopy_unsafe.go +++ b/pkg/safecopy/safecopy_unsafe.go @@ -20,7 +20,6 @@ import ( "unsafe" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" ) // maxRegisterSize is the maximum register size used in memcpy and memclr. It @@ -332,39 +331,3 @@ func errorFromFaultSignal(addr uintptr, sig int32) error { panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) } } - -// ReplaceSignalHandler replaces the existing signal handler for the provided -// signal with the one that handles faults in safecopy-protected functions. -// -// It stores the value of the previously set handler in previous. -// -// This function will be called on initialization in order to install safecopy -// handlers for appropriate signals. These handlers will call the previous -// handler however, and if this is function is being used externally then the -// same courtesy is expected. -func ReplaceSignalHandler(sig unix.Signal, handler uintptr, previous *uintptr) error { - var sa linux.SigAction - const maskLen = 8 - - // Get the existing signal handler information, and save the current - // handler. Once we replace it, we will use this pointer to fall back to - // it when we receive other signals. - if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { - return e - } - - // Fail if there isn't a previous handler. - if sa.Handler == 0 { - return fmt.Errorf("previous handler for signal %x isn't set", sig) - } - - *previous = uintptr(sa.Handler) - - // Install our own handler. - sa.Handler = uint64(handler) - if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { - return e - } - - return nil -} diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 4370cce33..d2eb03bb7 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -45,7 +45,8 @@ type pipeOperations struct { fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.Queue `state:"nosave"` + + waiter.Queue // flags are the flags used to open the pipe. flags fs.FileFlags `state:".(fs.FileFlags)"` diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 92d58e3e9..99c37291e 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -70,7 +70,7 @@ type inodeFileState struct { descriptor *descriptor `state:"wait"` // Event queue for blocking operations. - queue waiter.Queue `state:"zerovalue"` + queue waiter.Queue // sattr is used to restore the inodeOperations. sattr fs.StableAttr `state:"wait"` diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 51cd6cd37..941f37116 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -43,7 +43,7 @@ type Inotify struct { // user, since we may aggressively reuse an id on S/R. id uint64 - waiter.Queue `state:"nosave"` + waiter.Queue // evMu *only* protects the events list. We need a separate lock because // while queuing events, a watch needs to lock the event queue, and using mu diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 7d7a207cc..e39d340fe 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -132,7 +132,7 @@ type Locks struct { locks LockSet // blockedQueue is the queue of waiters that are waiting on a lock. - blockedQueue waiter.Queue `state:"zerovalue"` + blockedQueue waiter.Queue } // Blocker is the interface used for blocking locks. Passing a nil Blocker diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 085aa6d61..443b9a94c 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -109,6 +109,9 @@ func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode "shmall": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10))), "shmmax": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10))), "shmmni": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10))), + "msgmni": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.MSGMNI, 10))), + "msgmax": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.MSGMAX, 10))), + "msgmnb": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.MSGMNB, 10))), } d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 1c8518d71..ca8be8683 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -43,7 +43,7 @@ type TimerOperations struct { fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` - events waiter.Queue `state:"zerovalue"` + events waiter.Queue timer *ktime.Timer // val is the number of timer expirations since the last successful call to diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index f9fca6d8e..f2c9e9668 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -102,10 +102,10 @@ type lineDiscipline struct { column int // masterWaiter is used to wait on the master end of the TTY. - masterWaiter waiter.Queue `state:"zerovalue"` + masterWaiter waiter.Queue // replicaWaiter is used to wait on the replica end of the TTY. - replicaWaiter waiter.Queue `state:"zerovalue"` + replicaWaiter waiter.Queue } func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 7bef8242f..b98825e26 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1595,7 +1595,10 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats // (b/148380782). Allow all other extended attributes to be passed through // to the remote filesystem. This is inconsistent with Linux's 9p client, // but consistent with other filesystems (e.g. FUSE). - if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) { + // + // NOTE(b/202533394): Also disallow "trusted" namespace for now. This is + // consistent with the VFS1 gofer client. + if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return linuxerr.EOPNOTSUPP } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) @@ -2046,16 +2049,7 @@ func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) { } if d.fs.opts.lisaEnabled { - xattrs, err := d.controlFDLisa.ListXattr(ctx, size) - if err != nil { - return nil, err - } - - res := make([]string, 0, len(xattrs)) - for _, xattr := range xattrs { - res = append(res, xattr) - } - return res, nil + return d.controlFDLisa.ListXattr(ctx, size) } xattrMap, err := d.file.listXattr(ctx, size) diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 26d44744b..7b0be9c14 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -268,6 +268,6 @@ func cpuInfoData(k *kernel.Kernel) string { return buf.String() } -func shmData(v uint64) dynamicInode { +func ipcData(v uint64) dynamicInode { return newStaticFile(strconv.FormatUint(v, 10)) } diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 99f64a9d8..82e2857b3 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -47,9 +47,12 @@ func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k * "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), "sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), - "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)), - "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)), - "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)), + "shmall": fs.newInode(ctx, root, 0444, ipcData(linux.SHMALL)), + "shmmax": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMAX)), + "shmmni": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMNI)), + "msgmni": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNI)), + "msgmax": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMAX)), + "msgmnb": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNB)), "yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root), }), diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD index 66fa1ad40..03c8e2f38 100644 --- a/pkg/sentry/hostmm/BUILD +++ b/pkg/sentry/hostmm/BUILD @@ -12,8 +12,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/fd", - "//pkg/hostarch", + "//pkg/eventfd", "//pkg/log", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go index 285ea9050..5df06a60f 100644 --- a/pkg/sentry/hostmm/hostmm.go +++ b/pkg/sentry/hostmm/hostmm.go @@ -21,9 +21,7 @@ import ( "os" "path" - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/log" ) @@ -54,7 +52,7 @@ func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) } defer eventControlFile.Close() - eventFD, err := newEventFD() + eventFD, err := eventfd.Create() if err != nil { return nil, err } @@ -75,20 +73,11 @@ func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) const stopVal = 1 << 63 stopCh := make(chan struct{}) go func() { // S/R-SAFE: f provides synchronization if necessary - rw := fd.NewReadWriter(eventFD.FD()) - var buf [sizeofUint64]byte for { - n, err := rw.Read(buf[:]) + val, err := eventFD.Read() if err != nil { - if err == unix.EINTR { - continue - } panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err)) } - if n != sizeofUint64 { - panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) - } - val := hostarch.ByteOrder.Uint64(buf[:]) if val >= stopVal { // Assume this was due to the notifier's "destructor" (the // function returned by NotifyCurrentMemcgPressureCallback @@ -101,30 +90,7 @@ func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) } }() return func() { - rw := fd.NewReadWriter(eventFD.FD()) - var buf [sizeofUint64]byte - hostarch.ByteOrder.PutUint64(buf[:], stopVal) - for { - n, err := rw.Write(buf[:]) - if err != nil { - if err == unix.EINTR { - continue - } - panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err)) - } - if n != sizeofUint64 { - panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) - } - break - } + eventFD.Write(stopVal) <-stopCh }, nil } - -func newEventFD() (*fd.FD, error) { - f, _, e := unix.Syscall(unix.SYS_EVENTFD2, 0, 0, 0) - if e != 0 { - return nil, fmt.Errorf("failed to create eventfd: %v", e) - } - return fd.New(int(f)), nil -} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c0f13bf52..53a21e1e2 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -255,7 +255,6 @@ go_library( "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/msgqueue", "//pkg/sentry/kernel/sched", diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 6006c46a9..8d0a21baf 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -66,7 +66,7 @@ type pollEntry struct { file *refs.WeakRef `state:"manual"` id FileIdentifier `state:"wait"` userData [2]int32 - waiter waiter.Entry `state:"manual"` + waiter waiter.Entry mask waiter.EventMask flags EntryFlags @@ -102,7 +102,7 @@ type EventPoll struct { // Wait queue is used to notify interested parties when the event poll // object itself becomes readable or writable. - waiter.Queue `state:"zerovalue"` + waiter.Queue // files is the map of all the files currently being observed, it is // protected by mu. @@ -454,14 +454,3 @@ func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error { return nil } - -// UnregisterEpollWaiters removes the epoll waiter objects from the waiting -// queues. This is different from Release() as the file is not dereferenced. -func (e *EventPoll) UnregisterEpollWaiters() { - e.mu.Lock() - defer e.mu.Unlock() - - for _, entry := range e.files { - entry.id.File.EventUnregister(&entry.waiter) - } -} diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index e08d6287f..135a6d72c 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -21,9 +21,7 @@ import ( // afterLoad is invoked by stateify. func (p *pollEntry) afterLoad() { - p.waiter.Callback = p p.file = refs.NewWeakRef(p.id.File, p) - p.id.File.EventRegister(&p.waiter, p.mask) } // afterLoad is invoked by stateify. diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 5ea44a2c2..bf625dede 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -54,7 +54,7 @@ type EventOperations struct { // Queue is used to notify interested parties when the event object // becomes readable or writable. - wq waiter.Queue `state:"zerovalue"` + wq waiter.Queue // val is the current value of the event counter. val uint64 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index df5160b67..5dc821a48 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -57,7 +57,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -78,11 +77,19 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" ) -// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow -// easy access everywhere. To be removed once VFS2 becomes the default. +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global to allow +// easy access everywhere. +// +// TODO(gvisor.dev/issue/1624): Remove when VFS1 is no longer used. var VFS2Enabled = false -// FUSEEnabled is set to true when FUSE is enabled. Added as a global for allow +// LISAFSEnabled is set to true when lisafs protocol is enabled. Added as a +// global to allow easy access everywhere. +// +// TODO(gvisor.dev/issue/6319): Remove when lisafs is default. +var LISAFSEnabled = false + +// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow // easy access everywhere. To be removed once FUSE is completed. var FUSEEnabled = false @@ -478,11 +485,6 @@ func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error { return err } - // Remove all epoll waiter objects from underlying wait queues. - // NOTE: for programs to resume execution in future snapshot scenarios, - // we will need to re-establish these waiter objects after saving. - k.tasks.unregisterEpollWaiters(ctx) - // Clear the dirent cache before saving because Dirents must be Loaded in a // particular order (parents before children), and Loading dirents from a cache // breaks that order. @@ -615,32 +617,6 @@ func (k *Kernel) flushWritesToFiles(ctx context.Context) error { }) } -// Preconditions: !VFS2Enabled. -func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { - ts.mu.RLock() - defer ts.mu.RUnlock() - - // Tasks that belong to the same process could potentially point to the - // same FDTable. So we retain a map of processed ones to avoid - // processing the same FDTable multiple times. - processed := make(map[*FDTable]struct{}) - for t := range ts.Root.tids { - // We can skip locking Task.mu here since the kernel is paused. - if t.fdTable == nil { - continue - } - if _, ok := processed[t.fdTable]; ok { - continue - } - t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if e, ok := file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } - }) - processed[t.fdTable] = struct{}{} - } -} - // Preconditions: The kernel must be paused. func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { invalidated := make(map[*mm.MemoryManager]struct{}) diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 86beee6fe..8345473f3 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -55,7 +55,7 @@ const ( // // +stateify savable type Pipe struct { - waiter.Queue `state:"nosave"` + waiter.Queue // isNamed indicates whether this is a named pipe. // diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index b0004482c..1ea3c1bf7 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -158,7 +158,7 @@ type Task struct { // signalQueue is protected by the signalMutex. Note that the task does // not implement all queue methods, specifically the readiness checks. // The task only broadcast a notification on signal delivery. - signalQueue waiter.Queue `state:"zerovalue"` + signalQueue waiter.Queue // If groupStopPending is true, the task should participate in a group // stop in the interrupt path. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index c5b099559..f0c168ecc 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -191,9 +191,11 @@ const ( // // Preconditions: The task's owning TaskSet.mu must be locked. func (t *Task) updateInfoLocked() { - // Use the task's TID in the root PID namespace for logging. + // Use the task's TID and PID in the root PID namespace for logging. + pid := t.tg.pidns.owner.Root.tgids[t.tg] tid := t.tg.pidns.owner.Root.tids[t] - t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.logPrefix.Store(fmt.Sprintf("[% 4d:% 4d] ", pid, tid)) + t.rebuildTraceContext(tid) } diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 77ad62445..e38b723ce 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -324,11 +324,7 @@ type threadGroupNode struct { // eventQueue is notified whenever a event of interest to Task.Wait occurs // in a child of this thread group, or a ptrace tracee of a task in this // thread group. Events are defined in task_exit.go. - // - // Note that we cannot check and save this wait queue similarly to other - // wait queues, as the queue will not be empty by the time of saving, due - // to the wait sourced from Exec(). - eventQueue waiter.Queue `state:"nosave"` + eventQueue waiter.Queue // leader is the thread group's leader, which is the oldest task in the // thread group; usually the last task in the thread group to call diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index a26f54269..834d72408 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -63,7 +63,6 @@ go_library( "//pkg/procid", "//pkg/ring0", "//pkg/ring0/pagetables", - "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/arch/fpu", @@ -71,6 +70,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/time", + "//pkg/sighandling", "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 826997e77..5be2215ed 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -19,8 +19,8 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/ring0" - "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sighandling" ) // bluepill enters guest mode. @@ -97,7 +97,7 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) { func init() { // Install the handler. - if err := safecopy.ReplaceSignalHandler(bluepillSignal, addrOfSighandler(), &savedHandler); err != nil { + if err := sighandling.ReplaceSignalHandler(bluepillSignal, addrOfSighandler(), &savedHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 0567c8d32..b2db2bb9f 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -71,10 +71,6 @@ func (c *vCPU) KernelSyscall() { if regs.Rax != ^uint64(0) { regs.Rip -= 2 // Rewind. } - // We only trigger a bluepill entry in the bluepill function, and can - // therefore be guaranteed that there is no floating point state to be - // loaded on resuming from halt. We only worry about saving on exit. - ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. // N.B. Since KernelSyscall is called when the kernel makes a syscall, // FS_BASE is already set for correct execution of this function. // @@ -112,8 +108,6 @@ func (c *vCPU) KernelException(vector ring0.Vector) { regs.Rip = 0 } // See above. - ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - // See above. ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } @@ -144,5 +138,5 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { // Set the context pointer to the saved floating point state. This is // where the guest data has been serialized, the kernel will restore // from this new pointer value. - context.Fpstate = uint64(uintptrValue(c.floatingPointState.BytePointer())) + context.Fpstate = uint64(uintptrValue(c.FloatingPointState().BytePointer())) // escapes: no. } diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index acb0cb05f..df772d620 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -70,7 +70,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { - fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) + fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no context.Fpsimd64.Fpsr = fpsimd.Fpsr context.Fpsimd64.Fpcr = fpsimd.Fpcr context.Fpsimd64.Vregs = fpsimd.Vregs @@ -90,12 +90,12 @@ func (c *vCPU) KernelSyscall() { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) + fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs(c.floatingPointState.BytePointer()) + ring0.SaveVRegs(c.FloatingPointState().BytePointer()) // escapes: no } ring0.Halt() @@ -114,12 +114,12 @@ func (c *vCPU) KernelException(vector ring0.Vector) { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) + fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs(c.floatingPointState.BytePointer()) + ring0.SaveVRegs(c.FloatingPointState().BytePointer()) // escapes: no } ring0.Halt() diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index aac0fdffe..ad6863646 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -77,7 +77,11 @@ var ( // OpenDevice opens the KVM device at /dev/kvm and returns the File. func OpenDevice() (*os.File, error) { - f, err := os.OpenFile("/dev/kvm", unix.O_RDWR, 0) + dev, ok := os.LookupEnv("GVISOR_KVM_DEV") + if !ok { + dev = "/dev/kvm" + } + f, err := os.OpenFile(dev, unix.O_RDWR, 0) if err != nil { return nil, fmt.Errorf("error opening /dev/kvm: %v", err) } diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index dcf34015d..f1f7e4ea4 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -28,9 +28,9 @@ import ( "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/seccomp" ktime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/sync" ) @@ -723,7 +723,7 @@ func addrOfSigsysHandler() uintptr func seccompMmapRules(m *machine) { seccompMmapRulesOnce.Do(func() { // Install the handler. - if err := safecopy.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { + if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } rules := []seccomp.RuleSet{} diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index ab1e036b7..5bc023899 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" ) @@ -72,10 +71,6 @@ type vCPUArchState struct { // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs - - // floatingPointState is the floating point state buffer used in guest - // to host transitions. See usage in bluepill_amd64.go. - floatingPointState fpu.State } const ( @@ -152,12 +147,6 @@ func (c *vCPU) initArchState() error { return fmt.Errorf("error setting user registers: %v", errno) } - // Allocate some floating point state save area for the local vCPU. - // This will be saved prior to leaving the guest, and we restore from - // this always. We cannot use the pointer in the context alone because - // we don't know how large the area there is in reality. - c.floatingPointState = fpu.NewState() - // Set the time offset to the host native time. return c.setSystemTime() } diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 08d98c479..31998a600 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ) @@ -40,10 +39,6 @@ type vCPUArchState struct { // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs - - // floatingPointState is the floating point state buffer used in guest - // to host transitions. See usage in bluepill_arm64.go. - floatingPointState fpu.State } const ( diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 7e8e19dcb..e73d5c544 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" ) @@ -159,8 +158,6 @@ func (c *vCPU) initArchState() error { c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) } - c.floatingPointState = fpu.NewState() - return c.setSystemTime() } diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 7ee89a735..00f925166 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -4,7 +4,10 @@ package(licenses = ["notice"]) go_library( name = "socket", - srcs = ["socket.go"], + srcs = [ + "socket.go", + "socket_state.go", + ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index f9a5b0df1..6077b2150 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -29,10 +29,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "time" ) -const maxInt = int(^uint(0) >> 1) - // SCMCredentials represents a SCM_CREDENTIALS socket control message. type SCMCredentials interface { transport.CredentialsControlMessage @@ -78,7 +77,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { } // Files implements SCMRights.Files. -func (fs *RightsFiles) Files(ctx context.Context, max int) (RightsFiles, bool) { +func (fs *RightsFiles) Files(_ context.Context, max int) (RightsFiles, bool) { n := max var trunc bool if l := len(*fs); n > l { @@ -124,7 +123,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 break } - fds = append(fds, int32(fd)) + fds = append(fds, fd) } return fds, trunc } @@ -300,8 +299,8 @@ func alignSlice(buf []byte, align uint) []byte { } // PackTimestamp packs a SO_TIMESTAMP socket control message. -func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { - timestampP := linux.NsecToTimeval(timestamp) +func PackTimestamp(t *kernel.Task, timestamp time.Time, buf []byte) []byte { + timestampP := linux.NsecToTimeval(timestamp.UnixNano()) return putCmsgStruct( buf, linux.SOL_SOCKET, @@ -545,7 +544,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) } var ts linux.Timeval ts.UnmarshalUnsafe(buf[i : i+linux.SizeOfTimeval]) - cmsgs.IP.Timestamp = ts.ToNsecCapped() + cmsgs.IP.Timestamp = ts.ToTime() cmsgs.IP.HasTimestamp = true i += bits.AlignUp(length, width) diff --git a/pkg/sentry/socket/control/control_test.go b/pkg/sentry/socket/control/control_test.go index 7e28a0cef..1b04e1bbc 100644 --- a/pkg/sentry/socket/control/control_test.go +++ b/pkg/sentry/socket/control/control_test.go @@ -50,7 +50,7 @@ func TestParse(t *testing.T) { want := socket.ControlMessages{ IP: socket.IPControlMessages{ HasTimestamp: true, - Timestamp: ts.ToNsecCapped(), + Timestamp: ts.ToTime(), }, } if diff := cmp.Diff(want, cmsg); diff != "" { diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 1c1e501ba..6e2318f75 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -111,7 +111,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } return readv(s.fd, safemem.IovecsFromBlockSeq(dsts)) })) - return int64(n), err + return n, err } // Write implements fs.FileOperations.Write. @@ -134,7 +134,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } return writev(s.fd, safemem.IovecsFromBlockSeq(srcs)) })) - return int64(n), err + return n, err } // Socket implements socket.Provider.Socket. @@ -180,7 +180,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, proto } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } @@ -207,7 +207,7 @@ type socketOpsCommon struct { // Release implements fs.FileOperations.Release. func (s *socketOpsCommon) Release(context.Context) { fdnotifier.RemoveFD(int32(s.fd)) - unix.Close(s.fd) + _ = unix.Close(s.fd) } // Readiness implements waiter.Waitable.Readiness. @@ -218,13 +218,13 @@ func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { // EventRegister implements waiter.Waitable.EventRegister. func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.queue.EventRegister(e, mask) - fdnotifier.UpdateFD(int32(s.fd)) + _ = fdnotifier.UpdateFD(int32(s.fd)) } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.queue.EventUnregister(e) - fdnotifier.UpdateFD(int32(s.fd)) + _ = fdnotifier.UpdateFD(int32(s.fd)) } // Connect implements socket.Socket.Connect. @@ -316,7 +316,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, if kernel.VFS2Enabled { f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&unix.SOCK_NONBLOCK)) if err != nil { - unix.Close(fd) + _ = unix.Close(fd) return 0, nil, 0, err } defer f.DecRef(t) @@ -328,7 +328,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, } else { f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&unix.SOCK_NONBLOCK != 0) if err != nil { - unix.Close(fd) + _ = unix.Close(fd) return 0, nil, 0, err } defer f.DecRef(t) @@ -343,7 +343,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, } // Bind implements socket.Socket.Bind. -func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -356,12 +356,12 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Listen implements socket.Socket.Listen. -func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(_ *kernel.Task, backlog int) *syserr.Error { return syserr.FromError(unix.Listen(s.fd, backlog)) } // Shutdown implements socket.Socket.Shutdown. -func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(_ *kernel.Task, how int) *syserr.Error { switch how { case unix.SHUT_RD, unix.SHUT_WR, unix.SHUT_RDWR: return syserr.FromError(unix.Shutdown(s.fd, how)) @@ -371,7 +371,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, _ hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { if outLen < 0 { return nil, syserr.ErrInvalidArgument } @@ -401,7 +401,7 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr case linux.TCP_NODELAY: optlen = sizeofInt32 case linux.TCP_INFO: - optlen = int(linux.SizeOfTCPInfo) + optlen = linux.SizeOfTCPInfo } } @@ -579,7 +579,7 @@ func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) s controlMessages.IP.HasTimestamp = true ts := linux.Timeval{} ts.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfTimeval]) - controlMessages.IP.Timestamp = ts.ToNsecCapped() + controlMessages.IP.Timestamp = ts.ToTime() } case linux.SOL_IP: diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go index 0f6e576a9..b9c15daab 100644 --- a/pkg/sentry/socket/netfilter/targets.go +++ b/pkg/sentry/socket/netfilter/targets.go @@ -647,7 +647,7 @@ func (jt *JumpTarget) id() targetID { } // Action implements stack.Target.Action. -func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.Route, stack.AddressableEndpoint) (stack.RuleVerdict, int) { +func (jt *JumpTarget) Action(*stack.PacketBuffer, stack.Hook, *stack.Route, stack.AddressableEndpoint) (stack.RuleVerdict, int) { return stack.RuleJump, jt.RuleNum } diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index bf5ec4558..075f61cda 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "device.go", "netstack.go", + "netstack_state.go", "netstack_vfs2.go", "provider.go", "provider_vfs2.go", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index dedc32dda..030c6c8e4 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -274,6 +274,7 @@ var Metrics = tcpip.Stats{ ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), + SpuriousRecovery: mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), @@ -378,9 +379,9 @@ type socketOpsCommon struct { // timestampValid indicates whether timestamp for SIOCGSTAMP has been // set. It is protected by readMu. timestampValid bool - // timestampNS holds the timestamp to use with SIOCTSTAMP. It is only + // timestamp holds the timestamp to use with SIOCTSTAMP. It is only // valid when timestampValid is true. It is protected by readMu. - timestampNS int64 + timestamp time.Time `state:".(int64)"` // TODO(b/153685824): Move this to SocketOptions. // sockOptInq corresponds to TCP_INQ. @@ -410,15 +411,6 @@ var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes() var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes() var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes() -// bytesToIPAddress converts an IPv4 or IPv6 address from the user to the -// netstack representation taking any addresses into account. -func bytesToIPAddress(addr []byte) tcpip.Address { - if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) { - return "" - } - return tcpip.Address(addr) -} - // minSockAddrLen returns the minimum length in bytes of a socket address for // the socket's family. func (s *socketOpsCommon) minSockAddrLen() int { @@ -468,7 +460,7 @@ func (s *socketOpsCommon) Release(ctx context.Context) { t := kernel.TaskFromContext(ctx) start := t.Kernel().MonotonicClock().Now() deadline := start.Add(v.Timeout) - t.BlockWithDeadline(ch, true, deadline) + _ = t.BlockWithDeadline(ch, true, deadline) } } @@ -488,7 +480,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } // WriteTo implements fs.FileOperations.WriteTo. -func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) { +func (s *SocketOperations) WriteTo(_ context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) { s.readMu.Lock() defer s.readMu.Unlock() @@ -543,7 +535,7 @@ func (l *limitedPayloader) Len() int { } // ReadFrom implements fs.FileOperations.ReadFrom. -func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { +func (s *SocketOperations) ReadFrom(_ context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { f := limitedPayloader{ inner: io.LimitedReader{ R: r, @@ -654,7 +646,7 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) < 2 { return syserr.ErrInvalidArgument } @@ -714,7 +706,7 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Listen implements the linux syscall listen(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(_ *kernel.Task, backlog int) *syserr.Error { return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog)) } @@ -805,7 +797,7 @@ func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { // Shutdown implements the linux syscall shutdown(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(_ *kernel.Task, how int) *syserr.Error { f, err := ConvertShutdown(how) if err != nil { return err @@ -886,7 +878,7 @@ func boolToInt32(v bool) int32 { } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_ERROR: @@ -1402,11 +1394,11 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true) + info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true) if err != nil { return nil, err } @@ -1422,11 +1414,11 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen) + entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } @@ -1442,8 +1434,8 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) @@ -1459,7 +1451,7 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name } // getSockOptIP implements GetSockOpt when level is SOL_IP. -func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) { +func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return nil, syserr.ErrUnknownProtocolOption @@ -1599,11 +1591,11 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false) + info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false) if err != nil { return nil, err } @@ -1619,11 +1611,11 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } - entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen) + entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } @@ -1639,8 +1631,8 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return nil, syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) @@ -2186,12 +2178,12 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, true) + return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, true) case linux.IP6T_SO_SET_ADD_COUNTERS: log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") @@ -2429,12 +2421,12 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return syserr.ErrProtocolNotAvailable } - stack := inet.StackFromContext(t) - if stack == nil { + stk := inet.StackFromContext(t) + if stk == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, false) + return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, false) case linux.IPT_SO_SET_ADD_COUNTERS: log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") @@ -2601,7 +2593,7 @@ func emitUnimplementedEventIP(t *kernel.Task, name int) { // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2613,7 +2605,7 @@ func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, * // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2774,7 +2766,7 @@ func (s *socketOpsCommon) updateTimestamp(cm tcpip.ControlMessages) { // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. if !s.sockOptTimestamp { s.timestampValid = true - s.timestampNS = cm.Timestamp + s.timestamp = cm.Timestamp } } @@ -2833,7 +2825,7 @@ func (s *socketOpsCommon) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { if flags&linux.MSG_ERRQUEUE != 0 { return s.recvErr(t, dst) } @@ -2998,7 +2990,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy return 0, linuxerr.ENOENT } - tv := linux.NsecToTimeval(s.timestampNS) + tv := linux.NsecToTimeval(s.timestamp.UnixNano()) _, err := tv.CopyOut(t, args[2].Pointer()) return 0, err @@ -3105,7 +3097,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc } // interfaceIoctl implements interface requests. -func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { +func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { var ( iface inet.Interface index int32 @@ -3113,8 +3105,8 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe ) // Find the relevant device. - stack := inet.StackFromContext(ctx) - if stack == nil { + stk := inet.StackFromContext(ctx) + if stk == nil { return syserr.ErrNoDevice } @@ -3124,7 +3116,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe // Gets the name of the interface given the interface index // stored in ifr_ifindex. index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) - if iface, ok := stack.Interfaces()[index]; ok { + if iface, ok := stk.Interfaces()[index]; ok { ifr.SetName(iface.Name) return nil } @@ -3132,7 +3124,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } // Find the relevant device. - for index, iface = range stack.Interfaces() { + for index, iface = range stk.Interfaces() { if iface.Name == ifr.Name() { found = true break @@ -3165,7 +3157,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } case linux.SIOCGIFFLAGS: - f, err := interfaceStatusFlags(stack, iface.Name) + f, err := interfaceStatusFlags(stk, iface.Name) if err != nil { return err } @@ -3175,7 +3167,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe case linux.SIOCGIFADDR: // Copy the IPv4 address out. - for _, addr := range stack.InterfaceAddrs()[index] { + for _, addr := range stk.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue @@ -3211,7 +3203,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe case linux.SIOCGIFNETMASK: // Gets the network mask of a device. - for _, addr := range stack.InterfaceAddrs()[index] { + for _, addr := range stk.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue @@ -3243,24 +3235,24 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. -func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error { +func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error { // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. - stack := inet.StackFromContext(ctx) - if stack == nil { + stk := inet.StackFromContext(ctx) + if stk == nil { return syserr.ErrNoDevice.ToError() } if ifc.Ptr == 0 { - ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq) + ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq) return nil } max := ifc.Len ifc.Len = 0 - for key, ifaceAddrs := range stack.InterfaceAddrs() { - iface := stack.Interfaces()[key] + for key, ifaceAddrs := range stk.InterfaceAddrs() { + iface := stk.Interfaces()[key] for _, ifaceAddr := range ifaceAddrs { // Don't write past the end of the buffer. if ifc.Len+int32(linux.SizeOfIFReq) > max { diff --git a/pkg/sentry/socket/netstack/netstack_state.go b/pkg/sentry/socket/netstack/netstack_state.go new file mode 100644 index 000000000..591e00d42 --- /dev/null +++ b/pkg/sentry/socket/netstack/netstack_state.go @@ -0,0 +1,31 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "time" +) + +func (s *socketOpsCommon) saveTimestamp() int64 { + s.readMu.Lock() + defer s.readMu.Unlock() + return s.timestamp.UnixNano() +} + +func (s *socketOpsCommon) loadTimestamp(nsec int64) { + s.readMu.Lock() + defer s.readMu.Unlock() + s.timestamp = time.Unix(0, nsec) +} diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 2f0eb4a6c..d4b80a39d 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -21,6 +21,7 @@ import ( "bytes" "fmt" "sync/atomic" + "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" @@ -51,8 +52,8 @@ type ControlMessages struct { func packetInfoToLinux(packetInfo tcpip.IPPacketInfo) linux.ControlMessageIPPacketInfo { var p linux.ControlMessageIPPacketInfo p.NIC = int32(packetInfo.NIC) - copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr)) - copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr)) + copy(p.LocalAddr[:], packetInfo.LocalAddr) + copy(p.DestinationAddr[:], packetInfo.DestinationAddr) return p } @@ -60,7 +61,7 @@ func packetInfoToLinux(packetInfo tcpip.IPPacketInfo) linux.ControlMessageIPPack // format. func ipv6PacketInfoToLinux(packetInfo tcpip.IPv6PacketInfo) linux.ControlMessageIPv6PacketInfo { var p linux.ControlMessageIPv6PacketInfo - if n := copy(p.Addr[:], []byte(packetInfo.Addr)); n != len(p.Addr) { + if n := copy(p.Addr[:], packetInfo.Addr); n != len(p.Addr) { panic(fmt.Sprintf("got copy(%x, %x) = %d, want = %d", p.Addr, packetInfo.Addr, n, len(p.Addr))) } p.NIC = uint32(packetInfo.NIC) @@ -156,9 +157,9 @@ type IPControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool - // Timestamp is the time (in ns) that the last packet used to create - // the read data was received. - Timestamp int64 + // Timestamp is the time that the last packet used to create the read data + // was received. + Timestamp time.Time `state:".(int64)"` // HasInq indicates whether Inq is valid/set. HasInq bool diff --git a/pkg/sentry/socket/socket_state.go b/pkg/sentry/socket/socket_state.go new file mode 100644 index 000000000..32e12b238 --- /dev/null +++ b/pkg/sentry/socket/socket_state.go @@ -0,0 +1,27 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package socket + +import ( + "time" +) + +func (i *IPControlMessages) saveTimestamp() int64 { + return i.Timestamp.UnixNano() +} + +func (i *IPControlMessages) loadTimestamp(nsec int64) { + i.Timestamp = time.Unix(0, nsec) +} diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 757ff2a40..4d3f4d556 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -610,9 +610,9 @@ func (i *SyscallInfo) printExit(t *kernel.Task, elapsed time.Duration, output [] if err == nil { // Fill in the output after successful execution. i.post(t, args, retval, output, LogMaximumSize) - rval = fmt.Sprintf("%#x (%v)", retval, elapsed) + rval = fmt.Sprintf("%d (%#x) (%v)", retval, retval, elapsed) } else { - rval = fmt.Sprintf("%#x errno=%d (%s) (%v)", retval, errno, err, elapsed) + rval = fmt.Sprintf("%d (%#x) errno=%d (%s) (%v)", retval, retval, errno, err, elapsed) } switch len(output) { diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 04bc4d10c..fefd0fc9c 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -135,12 +135,16 @@ func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } ep.mu.Lock() - for epi := ep.ready.Front(); epi != nil; epi = epi.Next() { + var next *epollInterest + for epi := ep.ready.Front(); epi != nil; epi = next { + next = epi.Next() wmask := waiter.EventMaskFromLinux(epi.mask) if epi.key.file.Readiness(wmask)&wmask != 0 { ep.mu.Unlock() return waiter.ReadableEvents } + ep.ready.Remove(epi) + epi.ready = false } ep.mu.Unlock() return 0 diff --git a/pkg/shim/service.go b/pkg/shim/service.go index 24e3b7a82..0980d964e 100644 --- a/pkg/shim/service.go +++ b/pkg/shim/service.go @@ -77,6 +77,8 @@ const ( // shimAddressPath is the relative path to a file that contains the address // to the shim UDS. See service.shimAddress. shimAddressPath = "address" + + cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" ) // New returns a new shim service that can be used via GRPC. @@ -952,7 +954,7 @@ func newInit(path, workDir, namespace string, platform stdio.Platform, r *proc.C if err != nil { return nil, fmt.Errorf("update volume annotations: %w", err) } - updated = updateCgroup(spec) || updated + updated = setPodCgroup(spec) || updated if updated { if err := utils.WriteSpec(r.Bundle, spec); err != nil { @@ -980,12 +982,13 @@ func newInit(path, workDir, namespace string, platform stdio.Platform, r *proc.C return p, nil } -// updateCgroup updates cgroup path for the sandbox to make the sandbox join the -// pod cgroup and not the pause container cgroup. Returns true if the spec was -// modified. Ex.: -// /kubepods/burstable/pod123/abc => kubepods/burstable/pod123 +// setPodCgroup searches for the pod cgroup path inside the container's cgroup +// path. If found, it's set as an annotation in the spec. This is done so that +// the sandbox joins the pod cgroup. Otherwise, the sandbox would join the pause +// container cgroup. Returns true if the spec was modified. Ex.: +// /kubepods/burstable/pod123/container123 => kubepods/burstable/pod123 // -func updateCgroup(spec *specs.Spec) bool { +func setPodCgroup(spec *specs.Spec) bool { if !utils.IsSandbox(spec) { return false } @@ -1009,7 +1012,10 @@ func updateCgroup(spec *specs.Spec) bool { if spec.Linux.CgroupsPath == path { return false } - spec.Linux.CgroupsPath = path + if spec.Annotations == nil { + spec.Annotations = make(map[string]string) + } + spec.Annotations[cgroupParentAnnotation] = path return true } } diff --git a/pkg/shim/service_test.go b/pkg/shim/service_test.go index 2d9f07e02..4b4410a58 100644 --- a/pkg/shim/service_test.go +++ b/pkg/shim/service_test.go @@ -40,12 +40,12 @@ func TestCgroupPath(t *testing.T) { { name: "no-container", path: "foo/pod123", - want: "foo/pod123", + want: "", }, { name: "no-container-absolute", path: "/foo/pod123", - want: "/foo/pod123", + want: "", }, { name: "double-pod", @@ -70,7 +70,7 @@ func TestCgroupPath(t *testing.T) { { name: "no-pod", path: "/foo/nopod123/container", - want: "/foo/nopod123/container", + want: "", }, } { t.Run(tc.name, func(t *testing.T) { @@ -79,12 +79,12 @@ func TestCgroupPath(t *testing.T) { CgroupsPath: tc.path, }, } - updated := updateCgroup(&spec) - if spec.Linux.CgroupsPath != tc.want { - t.Errorf("updateCgroup(%q), want: %q, got: %q", tc.path, tc.want, spec.Linux.CgroupsPath) + updated := setPodCgroup(&spec) + if got := spec.Annotations[cgroupParentAnnotation]; got != tc.want { + t.Errorf("setPodCgroup(%q), want: %q, got: %q", tc.path, tc.want, got) } - if shouldUpdate := tc.path != tc.want; shouldUpdate != updated { - t.Errorf("updateCgroup(%q)=%v, want: %v", tc.path, updated, shouldUpdate) + if shouldUpdate := len(tc.want) > 0; shouldUpdate != updated { + t.Errorf("setPodCgroup(%q)=%v, want: %v", tc.path, updated, shouldUpdate) } }) } @@ -113,8 +113,8 @@ func TestCgroupNoUpdate(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - if updated := updateCgroup(tc.spec); updated { - t.Errorf("updateCgroup(%+v), got: %v, want: false", tc.spec.Linux, updated) + if updated := setPodCgroup(tc.spec); updated { + t.Errorf("setPodCgroup(%+v), got: %v, want: false", tc.spec.Linux, updated) } }) } diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sighandling/BUILD index 1790d57c9..72f10f982 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sighandling/BUILD @@ -8,7 +8,7 @@ go_library( "sighandling.go", "sighandling_unsafe.go", ], - visibility = ["//pkg/sentry:internal"], + visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sighandling/sighandling.go index bdaf8af29..bdaf8af29 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sighandling/sighandling.go diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sighandling/sighandling_unsafe.go index 3fe5c6770..7deeda042 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sighandling/sighandling_unsafe.go @@ -15,6 +15,7 @@ package sighandling import ( + "fmt" "unsafe" "golang.org/x/sys/unix" @@ -37,3 +38,36 @@ func IgnoreChildStop() error { return nil } + +// ReplaceSignalHandler replaces the existing signal handler for the provided +// signal with the function pointer at `handler`. This bypasses the Go runtime +// signal handlers, and should only be used for low-level signal handlers where +// use of signal.Notify is not appropriate. +// +// It stores the value of the previously set handler in previous. +func ReplaceSignalHandler(sig unix.Signal, handler uintptr, previous *uintptr) error { + var sa linux.SigAction + const maskLen = 8 + + // Get the existing signal handler information, and save the current + // handler. Once we replace it, we will use this pointer to fall back to + // it when we receive other signals. + if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { + return e + } + + // Fail if there isn't a previous handler. + if sa.Handler == 0 { + return fmt.Errorf("previous handler for signal %x isn't set", sig) + } + + *previous = uintptr(sa.Handler) + + // Install our own handler. + sa.Handler = uint64(handler) + if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { + return e + } + + return nil +} diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 73791b456..517f16329 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -26,6 +26,7 @@ go_library( "rwmutex_unsafe.go", "seqcount.go", "sync.go", + "wait.go", ], marshal = False, stateify = False, diff --git a/pkg/sync/wait.go b/pkg/sync/wait.go new file mode 100644 index 000000000..f8e7742a5 --- /dev/null +++ b/pkg/sync/wait.go @@ -0,0 +1,58 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sync + +// WaitGroupErr is similar to WaitGroup but allows goroutines to report error. +// Only the first error is retained and reported back. +// +// Example usage: +// wg := WaitGroupErr{} +// wg.Add(1) +// go func() { +// defer wg.Done() +// if err := ...; err != nil { +// wg.ReportError(err) +// return +// } +// }() +// return wg.Error() +// +type WaitGroupErr struct { + WaitGroup + + // mu protects firstErr. + mu Mutex + + // firstErr holds the first error reported. nil is no error occurred. + firstErr error +} + +// ReportError reports an error. Note it does not call Done(). +func (w *WaitGroupErr) ReportError(err error) { + w.mu.Lock() + defer w.mu.Unlock() + if w.firstErr == nil { + w.firstErr = err + } +} + +// Error waits for the counter to reach 0 and returns the first reported error +// if any. +func (w *WaitGroupErr) Error() error { + w.Wait() + w.mu.Lock() + defer w.mu.Unlock() + return w.firstErr +} diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index dbe4506cc..b98de54c5 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -25,6 +25,7 @@ go_library( "stdclock.go", "stdclock_state.go", "tcpip.go", + "tcpip_state.go", "timer.go", ], visibility = ["//visibility:public"], diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 87a0b9a62..e53789d92 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -152,10 +152,22 @@ type PollEvent struct { // no data is available, it will block in a poll() syscall until the file // descriptor becomes readable. func BlockingRead(fd int, b []byte) (int, tcpip.Error) { + n, err := BlockingReadUntranslated(fd, b) + if err != 0 { + return n, TranslateErrno(err) + } + return n, nil +} + +// BlockingReadUntranslated reads from a file descriptor that is set up as +// non-blocking. If no data is available, it will block in a poll() syscall +// until the file descriptor becomes readable. It returns the raw unix.Errno +// value returned by the underlying syscalls. +func BlockingReadUntranslated(fd int, b []byte) (int, unix.Errno) { for { n, _, e := unix.RawSyscall(unix.SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(&b[0])), uintptr(len(b))) if e == 0 { - return int(n), nil + return int(n), 0 } event := PollEvent{ @@ -165,7 +177,7 @@ func BlockingRead(fd int, b []byte) (int, tcpip.Error) { _, e = BlockingPoll(&event, 1, nil) if e != 0 && e != unix.EINTR { - return 0, TranslateErrno(e) + return 0, e } } } diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index 4215ee852..f8076d83c 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -5,19 +5,26 @@ package(licenses = ["notice"]) go_library( name = "sharedmem", srcs = [ + "queuepair.go", "rx.go", + "server_rx.go", + "server_tx.go", "sharedmem.go", + "sharedmem_server.go", "sharedmem_unsafe.go", "tx.go", ], visibility = ["//visibility:public"], deps = [ + "//pkg/cleanup", + "//pkg/eventfd", "//pkg/log", "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/link/rawfile", + "//pkg/tcpip/link/sharedmem/pipe", "//pkg/tcpip/link/sharedmem/queue", "//pkg/tcpip/stack", "@org_golang_x_sys//unix:go_default_library", @@ -26,9 +33,7 @@ go_library( go_test( name = "sharedmem_test", - srcs = [ - "sharedmem_test.go", - ], + srcs = ["sharedmem_test.go"], library = ":sharedmem", deps = [ "//pkg/sync", @@ -41,3 +46,22 @@ go_test( "@org_golang_x_sys//unix:go_default_library", ], ) + +go_test( + name = "sharedmem_server_test", + size = "small", + srcs = ["sharedmem_server_test.go"], + deps = [ + ":sharedmem", + "//pkg/tcpip", + "//pkg/tcpip/adapters/gonet", + "//pkg/tcpip/header", + "//pkg/tcpip/link/sniffer", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/tcpip/link/sharedmem/queue/rx.go b/pkg/tcpip/link/sharedmem/queue/rx.go index 696e6c9e5..a78826ebc 100644 --- a/pkg/tcpip/link/sharedmem/queue/rx.go +++ b/pkg/tcpip/link/sharedmem/queue/rx.go @@ -119,7 +119,6 @@ func (r *Rx) PostBuffers(buffers []RxBuffer) bool { } r.tx.Flush() - return true } @@ -131,7 +130,6 @@ func (r *Rx) PostBuffers(buffers []RxBuffer) bool { func (r *Rx) Dequeue(bufs []RxBuffer) ([]RxBuffer, uint32) { for { outBufs := bufs - // Pull the next descriptor from the rx pipe. b := r.rx.Pull() if b == nil { diff --git a/pkg/tcpip/link/sharedmem/queuepair.go b/pkg/tcpip/link/sharedmem/queuepair.go new file mode 100644 index 000000000..b12647fdd --- /dev/null +++ b/pkg/tcpip/link/sharedmem/queuepair.go @@ -0,0 +1,199 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux +// +build linux + +package sharedmem + +import ( + "fmt" + "io/ioutil" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/eventfd" +) + +const ( + // defaultQueueDataSize is the size of the shared memory data region that + // holds the scatter/gather buffers. + defaultQueueDataSize = 1 << 20 // 1MiB + + // defaultQueuePipeSize is the size of the pipe that holds the packet descriptors. + // + // Assuming each packet data is approximately 1280 bytes (IPv6 Minimum MTU) + // then we can hold approximately 1024*1024/1280 ~ 819 packets in the data + // area. Which means the pipe needs to be big enough to hold 819 + // descriptors. + // + // Each descriptor is approximately 8 (slot descriptor in pipe) + + // 16 (packet descriptor) + 12 (for buffer descriptor) assuming each packet is + // stored in exactly 1 buffer descriptor (see queue/tx.go and pipe/tx.go.) + // + // Which means we need approximately 36*819 ~ 29 KiB to store all packet + // descriptors. We could go with a 32 KiB pipe but to give it some slack in + // how the upper layer may make use of the scatter gather buffers we double + // this to hold enough descriptors. + defaultQueuePipeSize = 64 << 10 // 64KiB + + // defaultSharedDataSize is the size of the sharedData region used to + // enable/disable notifications. + defaultSharedDataSize = 4 << 10 // 4KiB +) + +// A QueuePair represents a pair of TX/RX queues. +type QueuePair struct { + // txCfg is the QueueConfig to be used for transmit queue. + txCfg QueueConfig + + // rxCfg is the QueueConfig to be used for receive queue. + rxCfg QueueConfig +} + +// NewQueuePair creates a shared memory QueuePair. +func NewQueuePair() (*QueuePair, error) { + txCfg, err := createQueueFDs(queueSizes{ + dataSize: defaultQueueDataSize, + txPipeSize: defaultQueuePipeSize, + rxPipeSize: defaultQueuePipeSize, + sharedDataSize: defaultSharedDataSize, + }) + + if err != nil { + return nil, fmt.Errorf("failed to create tx queue: %s", err) + } + + rxCfg, err := createQueueFDs(queueSizes{ + dataSize: defaultQueueDataSize, + txPipeSize: defaultQueuePipeSize, + rxPipeSize: defaultQueuePipeSize, + sharedDataSize: defaultSharedDataSize, + }) + + if err != nil { + closeFDs(txCfg) + return nil, fmt.Errorf("failed to create rx queue: %s", err) + } + + return &QueuePair{ + txCfg: txCfg, + rxCfg: rxCfg, + }, nil +} + +// Close closes underlying tx/rx queue fds. +func (q *QueuePair) Close() { + closeFDs(q.txCfg) + closeFDs(q.rxCfg) +} + +// TXQueueConfig returns the QueueConfig for the receive queue. +func (q *QueuePair) TXQueueConfig() QueueConfig { + return q.txCfg +} + +// RXQueueConfig returns the QueueConfig for the transmit queue. +func (q *QueuePair) RXQueueConfig() QueueConfig { + return q.rxCfg +} + +type queueSizes struct { + dataSize int64 + txPipeSize int64 + rxPipeSize int64 + sharedDataSize int64 +} + +func createQueueFDs(s queueSizes) (QueueConfig, error) { + success := false + var eventFD eventfd.Eventfd + var dataFD, txPipeFD, rxPipeFD, sharedDataFD int + defer func() { + if success { + return + } + closeFDs(QueueConfig{ + EventFD: eventFD, + DataFD: dataFD, + TxPipeFD: txPipeFD, + RxPipeFD: rxPipeFD, + SharedDataFD: sharedDataFD, + }) + }() + eventFD, err := eventfd.Create() + if err != nil { + return QueueConfig{}, fmt.Errorf("eventfd failed: %v", err) + } + dataFD, err = createFile(s.dataSize, false) + if err != nil { + return QueueConfig{}, fmt.Errorf("failed to create dataFD: %s", err) + } + txPipeFD, err = createFile(s.txPipeSize, true) + if err != nil { + return QueueConfig{}, fmt.Errorf("failed to create txPipeFD: %s", err) + } + rxPipeFD, err = createFile(s.rxPipeSize, true) + if err != nil { + return QueueConfig{}, fmt.Errorf("failed to create rxPipeFD: %s", err) + } + sharedDataFD, err = createFile(s.sharedDataSize, false) + if err != nil { + return QueueConfig{}, fmt.Errorf("failed to create sharedDataFD: %s", err) + } + success = true + return QueueConfig{ + EventFD: eventFD, + DataFD: dataFD, + TxPipeFD: txPipeFD, + RxPipeFD: rxPipeFD, + SharedDataFD: sharedDataFD, + }, nil +} + +func createFile(size int64, initQueue bool) (fd int, err error) { + const tmpDir = "/dev/shm/" + f, err := ioutil.TempFile(tmpDir, "sharedmem_test") + if err != nil { + return -1, fmt.Errorf("TempFile failed: %v", err) + } + defer f.Close() + unix.Unlink(f.Name()) + + if initQueue { + // Write the "slot-free" flag in the initial queue. + if _, err := f.WriteAt([]byte{0, 0, 0, 0, 0, 0, 0, 0x80}, 0); err != nil { + return -1, fmt.Errorf("WriteAt failed: %v", err) + } + } + + fd, err = unix.Dup(int(f.Fd())) + if err != nil { + return -1, fmt.Errorf("unix.Dup(%d) failed: %v", f.Fd(), err) + } + + if err := unix.Ftruncate(fd, size); err != nil { + unix.Close(fd) + return -1, fmt.Errorf("ftruncate(%d, %d) failed: %v", fd, size, err) + } + + return fd, nil +} + +func closeFDs(c QueueConfig) { + unix.Close(c.DataFD) + c.EventFD.Close() + unix.Close(c.TxPipeFD) + unix.Close(c.RxPipeFD) + unix.Close(c.SharedDataFD) +} diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go index e882a128c..87747dcc7 100644 --- a/pkg/tcpip/link/sharedmem/rx.go +++ b/pkg/tcpip/link/sharedmem/rx.go @@ -21,7 +21,7 @@ import ( "sync/atomic" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" + "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" ) @@ -30,7 +30,7 @@ type rx struct { data []byte sharedData []byte q queue.Rx - eventFD int + eventFD eventfd.Eventfd } // init initializes all state needed by the rx queue based on the information @@ -68,7 +68,7 @@ func (r *rx) init(mtu uint32, c *QueueConfig) error { // Duplicate the eventFD so that caller can close it but we can still // use it. - efd, err := unix.Dup(c.EventFD) + efd, err := c.EventFD.Dup() if err != nil { unix.Munmap(txPipe) unix.Munmap(rxPipe) @@ -77,16 +77,6 @@ func (r *rx) init(mtu uint32, c *QueueConfig) error { return err } - // Set the eventfd as non-blocking. - if err := unix.SetNonblock(efd, true); err != nil { - unix.Munmap(txPipe) - unix.Munmap(rxPipe) - unix.Munmap(data) - unix.Munmap(sharedData) - unix.Close(efd) - return err - } - // Initialize state based on buffers. r.q.Init(txPipe, rxPipe, sharedDataPointer(sharedData)) r.data = data @@ -105,7 +95,13 @@ func (r *rx) cleanup() { unix.Munmap(r.data) unix.Munmap(r.sharedData) - unix.Close(r.eventFD) + r.eventFD.Close() +} + +// notify writes to the tx.eventFD to indicate to the peer that there is data to +// be read. +func (r *rx) notify() { + r.eventFD.Notify() } // postAndReceive posts the provided buffers (if any), and then tries to read @@ -122,8 +118,7 @@ func (r *rx) postAndReceive(b []queue.RxBuffer, stopRequested *uint32) ([]queue. if len(b) != 0 && !r.q.PostBuffers(b) { r.q.EnableNotification() for !r.q.PostBuffers(b) { - var tmp [8]byte - rawfile.BlockingRead(r.eventFD, tmp[:]) + r.eventFD.Wait() if atomic.LoadUint32(stopRequested) != 0 { r.q.DisableNotification() return nil, 0 @@ -147,8 +142,7 @@ func (r *rx) postAndReceive(b []queue.RxBuffer, stopRequested *uint32) ([]queue. } // Wait for notification. - var tmp [8]byte - rawfile.BlockingRead(r.eventFD, tmp[:]) + r.eventFD.Wait() if atomic.LoadUint32(stopRequested) != 0 { r.q.DisableNotification() return nil, 0 diff --git a/pkg/tcpip/link/sharedmem/server_rx.go b/pkg/tcpip/link/sharedmem/server_rx.go new file mode 100644 index 000000000..6ea21ffd1 --- /dev/null +++ b/pkg/tcpip/link/sharedmem/server_rx.go @@ -0,0 +1,142 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux +// +build linux + +package sharedmem + +import ( + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/eventfd" + "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe" + "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" +) + +type serverRx struct { + // packetPipe represents the receive end of the pipe that carries the packet + // descriptors sent by the client. + packetPipe pipe.Rx + + // completionPipe represents the transmit end of the pipe that will carry + // completion notifications from the server to the client. + completionPipe pipe.Tx + + // data represents the buffer area where the packet payload is held. + data []byte + + // eventFD is used to notify the peer when transmission is completed. + eventFD eventfd.Eventfd + + // sharedData the memory region to use to enable/disable notifications. + sharedData []byte +} + +// init initializes all state needed by the serverTx queue based on the +// information provided. +// +// The caller always retains ownership of all file descriptors passed in. The +// queue implementation will duplicate any that it may need in the future. +func (s *serverRx) init(c *QueueConfig) error { + // Map in all buffers. + packetPipeMem, err := getBuffer(c.TxPipeFD) + if err != nil { + return err + } + cu := cleanup.Make(func() { unix.Munmap(packetPipeMem) }) + defer cu.Clean() + + completionPipeMem, err := getBuffer(c.RxPipeFD) + if err != nil { + return err + } + cu.Add(func() { unix.Munmap(completionPipeMem) }) + + data, err := getBuffer(c.DataFD) + if err != nil { + return err + } + cu.Add(func() { unix.Munmap(data) }) + + sharedData, err := getBuffer(c.SharedDataFD) + if err != nil { + return err + } + cu.Add(func() { unix.Munmap(sharedData) }) + + // Duplicate the eventFD so that caller can close it but we can still + // use it. + efd, err := c.EventFD.Dup() + if err != nil { + return err + } + cu.Add(func() { efd.Close() }) + + s.packetPipe.Init(packetPipeMem) + s.completionPipe.Init(completionPipeMem) + s.data = data + s.eventFD = efd + s.sharedData = sharedData + + cu.Release() + return nil +} + +func (s *serverRx) cleanup() { + unix.Munmap(s.packetPipe.Bytes()) + unix.Munmap(s.completionPipe.Bytes()) + unix.Munmap(s.data) + unix.Munmap(s.sharedData) + s.eventFD.Close() +} + +// completionNotificationSize is size in bytes of a completion notification sent +// on the completion queue after a transmitted packet has been handled. +const completionNotificationSize = 8 + +// receive receives a single packet from the packetPipe. +func (s *serverRx) receive() []byte { + desc := s.packetPipe.Pull() + if desc == nil { + return nil + } + + pktInfo := queue.DecodeTxPacketHeader(desc) + contents := make([]byte, 0, pktInfo.Size) + toCopy := pktInfo.Size + for i := 0; i < pktInfo.BufferCount; i++ { + txBuf := queue.DecodeTxBufferHeader(desc, i) + if txBuf.Size <= toCopy { + contents = append(contents, s.data[txBuf.Offset:][:txBuf.Size]...) + toCopy -= txBuf.Size + continue + } + contents = append(contents, s.data[txBuf.Offset:][:toCopy]...) + break + } + + // Flush to let peer know that slots queued for transmission have been handled + // and its free to reuse the slots. + s.packetPipe.Flush() + // Encode packet completion. + b := s.completionPipe.Push(completionNotificationSize) + queue.EncodeTxCompletion(b, pktInfo.ID) + s.completionPipe.Flush() + return contents +} + +func (s *serverRx) waitForPackets() { + s.eventFD.Wait() +} diff --git a/pkg/tcpip/link/sharedmem/server_tx.go b/pkg/tcpip/link/sharedmem/server_tx.go new file mode 100644 index 000000000..13a82903f --- /dev/null +++ b/pkg/tcpip/link/sharedmem/server_tx.go @@ -0,0 +1,175 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux +// +build linux + +package sharedmem + +import ( + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/eventfd" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe" + "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" +) + +// serverTx represents the server end of the sharedmem queue and is used to send +// packets to the peer in the buffers posted by the peer in the fillPipe. +type serverTx struct { + // fillPipe represents the receive end of the pipe that carries the RxBuffers + // posted by the peer. + fillPipe pipe.Rx + + // completionPipe represents the transmit end of the pipe that carries the + // descriptors for filled RxBuffers. + completionPipe pipe.Tx + + // data represents the buffer area where the packet payload is held. + data []byte + + // eventFD is used to notify the peer when fill requests are fulfilled. + eventFD eventfd.Eventfd + + // sharedData the memory region to use to enable/disable notifications. + sharedData []byte +} + +// init initializes all tstate needed by the serverTx queue based on the +// information provided. +// +// The caller always retains ownership of all file descriptors passed in. The +// queue implementation will duplicate any that it may need in the future. +func (s *serverTx) init(c *QueueConfig) error { + // Map in all buffers. + fillPipeMem, err := getBuffer(c.TxPipeFD) + if err != nil { + return err + } + cu := cleanup.Make(func() { unix.Munmap(fillPipeMem) }) + defer cu.Clean() + + completionPipeMem, err := getBuffer(c.RxPipeFD) + if err != nil { + return err + } + cu.Add(func() { unix.Munmap(completionPipeMem) }) + + data, err := getBuffer(c.DataFD) + if err != nil { + return err + } + cu.Add(func() { unix.Munmap(data) }) + + sharedData, err := getBuffer(c.SharedDataFD) + if err != nil { + return err + } + cu.Add(func() { unix.Munmap(sharedData) }) + + // Duplicate the eventFD so that caller can close it but we can still + // use it. + efd, err := c.EventFD.Dup() + if err != nil { + return err + } + cu.Add(func() { efd.Close() }) + + cu.Release() + + s.fillPipe.Init(fillPipeMem) + s.completionPipe.Init(completionPipeMem) + s.data = data + s.eventFD = efd + s.sharedData = sharedData + + return nil +} + +func (s *serverTx) cleanup() { + unix.Munmap(s.fillPipe.Bytes()) + unix.Munmap(s.completionPipe.Bytes()) + unix.Munmap(s.data) + unix.Munmap(s.sharedData) + s.eventFD.Close() +} + +// fillPacket copies the data in the provided views into buffers pulled from the +// fillPipe and returns a slice of RxBuffers that contain the copied data as +// well as the total number of bytes copied. +// +// To avoid allocations the filledBuffers are appended to the buffers slice +// which will be grown as required. +func (s *serverTx) fillPacket(views []buffer.View, buffers []queue.RxBuffer) (filledBuffers []queue.RxBuffer, totalCopied uint32) { + filledBuffers = buffers[:0] + // fillBuffer copies as much of the views as possible into the provided buffer + // and returns any left over views (if any). + fillBuffer := func(buffer *queue.RxBuffer, views []buffer.View) (left []buffer.View) { + if len(views) == 0 { + return nil + } + availBytes := buffer.Size + copied := uint64(0) + for availBytes > 0 && len(views) > 0 { + n := copy(s.data[buffer.Offset+copied:][:uint64(buffer.Size)-copied], views[0]) + views[0].TrimFront(n) + if !views[0].IsEmpty() { + break + } + views = views[1:] + copied += uint64(n) + availBytes -= uint32(n) + } + buffer.Size = uint32(copied) + return views + } + + for len(views) > 0 { + var b []byte + // Spin till we get a free buffer reposted by the peer. + for { + if b = s.fillPipe.Pull(); b != nil { + break + } + } + rxBuffer := queue.DecodeRxBufferHeader(b) + // Copy the packet into the posted buffer. + views = fillBuffer(&rxBuffer, views) + totalCopied += rxBuffer.Size + filledBuffers = append(filledBuffers, rxBuffer) + } + + return filledBuffers, totalCopied +} + +func (s *serverTx) transmit(views []buffer.View) bool { + buffers := make([]queue.RxBuffer, 8) + buffers, totalCopied := s.fillPacket(views, buffers) + b := s.completionPipe.Push(queue.RxCompletionSize(len(buffers))) + if b == nil { + return false + } + queue.EncodeRxCompletion(b, totalCopied, 0 /* reserved */) + for i := 0; i < len(buffers); i++ { + queue.EncodeRxCompletionBuffer(b, i, buffers[i]) + } + s.completionPipe.Flush() + s.fillPipe.Flush() + return true +} + +func (s *serverTx) notify() { + s.eventFD.Notify() +} diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 66efe6472..bcb37a465 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -24,14 +24,16 @@ package sharedmem import ( + "fmt" "sync/atomic" - "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -47,7 +49,7 @@ type QueueConfig struct { // EventFD is a file descriptor for the event that is signaled when // data is becomes available in this queue. - EventFD int + EventFD eventfd.Eventfd // TxPipeFD is a file descriptor for the tx pipe associated with the // queue. @@ -63,16 +65,97 @@ type QueueConfig struct { SharedDataFD int } +// FDs returns the FD's in the QueueConfig as a slice of ints. This must +// be used in conjunction with QueueConfigFromFDs to ensure the order +// of FDs matches when reconstructing the config when serialized or sent +// as part of control messages. +func (q *QueueConfig) FDs() []int { + return []int{q.DataFD, q.EventFD.FD(), q.TxPipeFD, q.RxPipeFD, q.SharedDataFD} +} + +// QueueConfigFromFDs constructs a QueueConfig out of a slice of ints where each +// entry represents an file descriptor. The order of FDs in the slice must be in +// the order specified below for the config to be valid. QueueConfig.FDs() +// should be used when the config needs to be serialized or sent as part of a +// control message to ensure the correct order. +func QueueConfigFromFDs(fds []int) (QueueConfig, error) { + if len(fds) != 5 { + return QueueConfig{}, fmt.Errorf("insufficient number of fds: len(fds): %d, want: 5", len(fds)) + } + return QueueConfig{ + DataFD: fds[0], + EventFD: eventfd.Wrap(fds[1]), + TxPipeFD: fds[2], + RxPipeFD: fds[3], + SharedDataFD: fds[4], + }, nil +} + +// Options specify the details about the sharedmem endpoint to be created. +type Options struct { + // MTU is the mtu to use for this endpoint. + MTU uint32 + + // BufferSize is the size of each scatter/gather buffer that will hold packet + // data. + // + // NOTE: This directly determines number of packets that can be held in + // the ring buffer at any time. This does not have to be sized to the MTU as + // the shared memory queue design allows usage of more than one buffer to be + // used to make up a given packet. + BufferSize uint32 + + // LinkAddress is the link address for this endpoint (required). + LinkAddress tcpip.LinkAddress + + // TX is the transmit queue configuration for this shared memory endpoint. + TX QueueConfig + + // RX is the receive queue configuration for this shared memory endpoint. + RX QueueConfig + + // PeerFD is the fd for the connected peer which can be used to detect + // peer disconnects. + PeerFD int + + // OnClosed is a function that is called when the endpoint is being closed + // (probably due to peer going away) + OnClosed func(err tcpip.Error) + + // TXChecksumOffload if true, indicates that this endpoints capability + // set should include CapabilityTXChecksumOffload. + TXChecksumOffload bool + + // RXChecksumOffload if true, indicates that this endpoints capability + // set should include CapabilityRXChecksumOffload. + RXChecksumOffload bool +} + type endpoint struct { // mtu (maximum transmission unit) is the maximum size of a packet. + // mtu is immutable. mtu uint32 // bufferSize is the size of each individual buffer. + // bufferSize is immutable. bufferSize uint32 // addr is the local address of this endpoint. + // addr is immutable. addr tcpip.LinkAddress + // peerFD is an fd to the peer that can be used to detect when the + // peer is gone. + // peerFD is immutable. + peerFD int + + // caps holds the endpoint capabilities. + caps stack.LinkEndpointCapabilities + + // hdrSize is the size of the link layer header if any. + // hdrSize is immutable. + hdrSize uint32 + // rx is the receive queue. rx rx @@ -83,34 +166,55 @@ type endpoint struct { // Wait group used to indicate that all workers have stopped. completed sync.WaitGroup + // onClosed is a function to be called when the FD's peer (if any) closes + // its end of the communication pipe. + onClosed func(tcpip.Error) + // mu protects the following fields. mu sync.Mutex // tx is the transmit queue. + // +checklocks:mu tx tx // workerStarted specifies whether the worker goroutine was started. + // +checklocks:mu workerStarted bool } // New creates a new shared-memory-based endpoint. Buffers will be broken up // into buffers of "bufferSize" bytes. -func New(mtu, bufferSize uint32, addr tcpip.LinkAddress, tx, rx QueueConfig) (stack.LinkEndpoint, error) { +func New(opts Options) (stack.LinkEndpoint, error) { e := &endpoint{ - mtu: mtu, - bufferSize: bufferSize, - addr: addr, + mtu: opts.MTU, + bufferSize: opts.BufferSize, + addr: opts.LinkAddress, + peerFD: opts.PeerFD, + onClosed: opts.OnClosed, } - if err := e.tx.init(bufferSize, &tx); err != nil { + if err := e.tx.init(opts.BufferSize, &opts.TX); err != nil { return nil, err } - if err := e.rx.init(bufferSize, &rx); err != nil { + if err := e.rx.init(opts.BufferSize, &opts.RX); err != nil { e.tx.cleanup() return nil, err } + e.caps = stack.LinkEndpointCapabilities(0) + if opts.RXChecksumOffload { + e.caps |= stack.CapabilityRXChecksumOffload + } + + if opts.TXChecksumOffload { + e.caps |= stack.CapabilityTXChecksumOffload + } + + if opts.LinkAddress != "" { + e.hdrSize = header.EthernetMinimumSize + e.caps |= stack.CapabilityResolutionRequired + } return e, nil } @@ -119,13 +223,13 @@ func (e *endpoint) Close() { // Tell dispatch goroutine to stop, then write to the eventfd so that // it wakes up in case it's sleeping. atomic.StoreUint32(&e.stopRequested, 1) - unix.Write(e.rx.eventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + e.rx.eventFD.Notify() // Cleanup the queues inline if the worker hasn't started yet; we also // know it won't start from now on because stopRequested is set to 1. e.mu.Lock() + defer e.mu.Unlock() workerPresent := e.workerStarted - e.mu.Unlock() if !workerPresent { e.tx.cleanup() @@ -146,6 +250,22 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { if !e.workerStarted && atomic.LoadUint32(&e.stopRequested) == 0 { e.workerStarted = true e.completed.Add(1) + + // Spin up a goroutine to monitor for peer shutdown. + if e.peerFD >= 0 { + e.completed.Add(1) + go func() { + defer e.completed.Done() + b := make([]byte, 1) + // When sharedmem endpoint is in use the peerFD is never used for any data + // transfer and this Read should only return if the peer is shutting down. + _, err := rawfile.BlockingRead(e.peerFD, b) + if e.onClosed != nil { + e.onClosed(err) + } + }() + } + // Link endpoints are not savable. When transportation endpoints // are saved, they stop sending outgoing packets and all // incoming packets are rejected. @@ -164,18 +284,18 @@ func (e *endpoint) IsAttached() bool { // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized // during construction. func (e *endpoint) MTU() uint32 { - return e.mtu - header.EthernetMinimumSize + return e.mtu - e.hdrSize } // Capabilities implements stack.LinkEndpoint.Capabilities. -func (*endpoint) Capabilities() stack.LinkEndpointCapabilities { - return 0 +func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.caps } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns the // ethernet frame header size. -func (*endpoint) MaxHeaderLength() uint16 { - return header.EthernetMinimumSize +func (e *endpoint) MaxHeaderLength() uint16 { + return uint16(e.hdrSize) } // LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the local @@ -205,17 +325,15 @@ func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net // WriteRawPacket implements stack.LinkEndpoint. func (*endpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } -// WritePacket writes outbound packets to the file descriptor. If it is not -// currently writable, the packet is dropped. -func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { - e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) +// +checklocks:e.mu +func (e *endpoint) writePacketLocked(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { + if e.addr != "" { + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) + } views := pkt.Views() // Transmit the packet. - e.mu.Lock() ok := e.tx.transmit(views...) - e.mu.Unlock() - if !ok { return &tcpip.ErrWouldBlock{} } @@ -223,9 +341,37 @@ func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocol return nil } +// WritePacket writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + if err := e.writePacketLocked(r, protocol, pkt); err != nil { + return err + } + e.tx.notify() + return nil +} + // WritePackets implements stack.LinkEndpoint.WritePackets. -func (*endpoint) WritePackets(stack.RouteInfo, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, tcpip.Error) { - panic("not implemented") +func (e *endpoint) WritePackets(r stack.RouteInfo, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, tcpip.Error) { + n := 0 + var err tcpip.Error + e.mu.Lock() + defer e.mu.Unlock() + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if err = e.writePacketLocked(r, pkt.NetworkProtocolNumber, pkt); err != nil { + break + } + n++ + } + // WritePackets never returns an error if it successfully transmitted at least + // one packet. + if err != nil && n == 0 { + return 0, err + } + e.tx.notify() + return n, nil } // dispatchLoop reads packets from the rx queue in a loop and dispatches them @@ -268,16 +414,42 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) { Data: buffer.View(b).ToVectorisedView(), }) - hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) - if !ok { - continue + var src, dst tcpip.LinkAddress + var proto tcpip.NetworkProtocolNumber + if e.addr != "" { + hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) + if !ok { + continue + } + eth := header.Ethernet(hdr) + src = eth.SourceAddress() + dst = eth.DestinationAddress() + proto = eth.Type() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + // IP version information is at the first octet, so pulling up 1 byte. + h, ok := pkt.Data().PullUp(1) + if !ok { + continue + } + switch header.IPVersion(h) { + case header.IPv4Version: + proto = header.IPv4ProtocolNumber + case header.IPv6Version: + proto = header.IPv6ProtocolNumber + default: + continue + } } - eth := header.Ethernet(hdr) // Send packet up the stack. - d.DeliverNetworkPacket(eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), pkt) + d.DeliverNetworkPacket(src, dst, proto, pkt) } + e.mu.Lock() + defer e.mu.Unlock() + // Clean state. e.tx.cleanup() e.rx.cleanup() diff --git a/pkg/tcpip/link/sharedmem/sharedmem_server.go b/pkg/tcpip/link/sharedmem/sharedmem_server.go new file mode 100644 index 000000000..ccc84989d --- /dev/null +++ b/pkg/tcpip/link/sharedmem/sharedmem_server.go @@ -0,0 +1,333 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux +// +build linux + +package sharedmem + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +type serverEndpoint struct { + // mtu (maximum transmission unit) is the maximum size of a packet. + // mtu is immutable. + mtu uint32 + + // bufferSize is the size of each individual buffer. + // bufferSize is immutable. + bufferSize uint32 + + // addr is the local address of this endpoint. + // addr is immutable + addr tcpip.LinkAddress + + // rx is the receive queue. + rx serverRx + + // stopRequested is to be accessed atomically only, and determines if the + // worker goroutines should stop. + stopRequested uint32 + + // Wait group used to indicate that all workers have stopped. + completed sync.WaitGroup + + // peerFD is an fd to the peer that can be used to detect when the peer is + // gone. + // peerFD is immutable. + peerFD int + + // caps holds the endpoint capabilities. + caps stack.LinkEndpointCapabilities + + // hdrSize is the size of the link layer header if any. + // hdrSize is immutable. + hdrSize uint32 + + // onClosed is a function to be called when the FD's peer (if any) closes its + // end of the communication pipe. + onClosed func(tcpip.Error) + + // mu protects the following fields. + mu sync.Mutex + + // tx is the transmit queue. + // +checklocks:mu + tx serverTx + + // workerStarted specifies whether the worker goroutine was started. + // +checklocks:mu + workerStarted bool +} + +// NewServerEndpoint creates a new shared-memory-based endpoint. Buffers will be +// broken up into buffers of "bufferSize" bytes. +func NewServerEndpoint(opts Options) (stack.LinkEndpoint, error) { + e := &serverEndpoint{ + mtu: opts.MTU, + bufferSize: opts.BufferSize, + addr: opts.LinkAddress, + peerFD: opts.PeerFD, + onClosed: opts.OnClosed, + } + + if err := e.tx.init(&opts.RX); err != nil { + return nil, err + } + + if err := e.rx.init(&opts.TX); err != nil { + e.tx.cleanup() + return nil, err + } + + e.caps = stack.LinkEndpointCapabilities(0) + if opts.RXChecksumOffload { + e.caps |= stack.CapabilityRXChecksumOffload + } + + if opts.TXChecksumOffload { + e.caps |= stack.CapabilityTXChecksumOffload + } + + if opts.LinkAddress != "" { + e.hdrSize = header.EthernetMinimumSize + e.caps |= stack.CapabilityResolutionRequired + } + + return e, nil +} + +// Close frees all resources associated with the endpoint. +func (e *serverEndpoint) Close() { + // Tell dispatch goroutine to stop, then write to the eventfd so that it wakes + // up in case it's sleeping. + atomic.StoreUint32(&e.stopRequested, 1) + e.rx.eventFD.Notify() + + // Cleanup the queues inline if the worker hasn't started yet; we also know it + // won't start from now on because stopRequested is set to 1. + e.mu.Lock() + defer e.mu.Unlock() + workerPresent := e.workerStarted + + if !workerPresent { + e.tx.cleanup() + e.rx.cleanup() + } +} + +// Wait implements stack.LinkEndpoint.Wait. It waits until all workers have +// stopped after a Close() call. +func (e *serverEndpoint) Wait() { + e.completed.Wait() +} + +// Attach implements stack.LinkEndpoint.Attach. It launches the goroutine that +// reads packets from the rx queue. +func (e *serverEndpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.mu.Lock() + if !e.workerStarted && atomic.LoadUint32(&e.stopRequested) == 0 { + e.workerStarted = true + e.completed.Add(1) + if e.peerFD >= 0 { + e.completed.Add(1) + // Spin up a goroutine to monitor for peer shutdown. + go func() { + b := make([]byte, 1) + // When sharedmem endpoint is in use the peerFD is never used for any + // data transfer and this Read should only return if the peer is + // shutting down. + _, err := rawfile.BlockingRead(e.peerFD, b) + if e.onClosed != nil { + e.onClosed(err) + } + e.completed.Done() + }() + } + // Link endpoints are not savable. When transportation endpoints are saved, + // they stop sending outgoing packets and all incoming packets are rejected. + go e.dispatchLoop(dispatcher) // S/R-SAFE: see above. + } + e.mu.Unlock() +} + +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *serverEndpoint) IsAttached() bool { + e.mu.Lock() + defer e.mu.Unlock() + return e.workerStarted +} + +// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized +// during construction. +func (e *serverEndpoint) MTU() uint32 { + return e.mtu - e.hdrSize +} + +// Capabilities implements stack.LinkEndpoint.Capabilities. +func (e *serverEndpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.caps +} + +// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns the +// ethernet frame header size. +func (e *serverEndpoint) MaxHeaderLength() uint16 { + return uint16(e.hdrSize) +} + +// LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the local +// link address. +func (e *serverEndpoint) LinkAddress() tcpip.LinkAddress { + return e.addr +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *serverEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + // Add ethernet header if needed. + eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) + ethHdr := &header.EthernetFields{ + DstAddr: remote, + Type: protocol, + } + + // Preserve the src address if it's set in the route. + if local != "" { + ethHdr.SrcAddr = local + } else { + ethHdr.SrcAddr = e.addr + } + eth.Encode(ethHdr) +} + +// WriteRawPacket implements stack.LinkEndpoint. +func (*serverEndpoint) WriteRawPacket(*stack.PacketBuffer) tcpip.Error { + return &tcpip.ErrNotSupported{} +} + +// +checklocks:e.mu +func (e *serverEndpoint) writePacketLocked(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) + + views := pkt.Views() + ok := e.tx.transmit(views) + if !ok { + return &tcpip.ErrWouldBlock{} + } + + return nil +} + +// WritePacket writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *serverEndpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { + // Transmit the packet. + e.mu.Lock() + defer e.mu.Unlock() + if err := e.writePacketLocked(r, protocol, pkt); err != nil { + return err + } + e.tx.notify() + return nil +} + +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (e *serverEndpoint) WritePackets(r stack.RouteInfo, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, tcpip.Error) { + n := 0 + var err tcpip.Error + e.mu.Lock() + defer e.mu.Unlock() + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if err = e.writePacketLocked(r, pkt.NetworkProtocolNumber, pkt); err != nil { + break + } + n++ + } + // WritePackets never returns an error if it successfully transmitted at least + // one packet. + if err != nil && n == 0 { + return 0, err + } + e.tx.notify() + return n, nil +} + +// dispatchLoop reads packets from the rx queue in a loop and dispatches them +// to the network stack. +func (e *serverEndpoint) dispatchLoop(d stack.NetworkDispatcher) { + for atomic.LoadUint32(&e.stopRequested) == 0 { + b := e.rx.receive() + if b == nil { + e.rx.waitForPackets() + continue + } + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.View(b).ToVectorisedView(), + }) + var src, dst tcpip.LinkAddress + var proto tcpip.NetworkProtocolNumber + if e.addr != "" { + hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) + if !ok { + continue + } + eth := header.Ethernet(hdr) + src = eth.SourceAddress() + dst = eth.DestinationAddress() + proto = eth.Type() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + // IP version information is at the first octet, so pulling up 1 byte. + h, ok := pkt.Data().PullUp(1) + if !ok { + continue + } + switch header.IPVersion(h) { + case header.IPv4Version: + proto = header.IPv4ProtocolNumber + case header.IPv6Version: + proto = header.IPv6ProtocolNumber + default: + continue + } + } + // Send packet up the stack. + d.DeliverNetworkPacket(src, dst, proto, pkt) + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Clean state. + e.tx.cleanup() + e.rx.cleanup() + + e.completed.Done() +} + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType +func (e *serverEndpoint) ARPHardwareType() header.ARPHardwareType { + if e.hdrSize > 0 { + return header.ARPHardwareEther + } + return header.ARPHardwareNone +} diff --git a/pkg/tcpip/link/sharedmem/sharedmem_server_test.go b/pkg/tcpip/link/sharedmem/sharedmem_server_test.go new file mode 100644 index 000000000..1bc58614e --- /dev/null +++ b/pkg/tcpip/link/sharedmem/sharedmem_server_test.go @@ -0,0 +1,220 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux +// +build linux + +package sharedmem_server_test + +import ( + "fmt" + "io" + "net" + "net/http" + "syscall" + "testing" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" +) + +const ( + localLinkAddr = "\xde\xad\xbe\xef\x56\x78" + remoteLinkAddr = "\xde\xad\xbe\xef\x12\x34" + localIPv4Address = tcpip.Address("\x0a\x00\x00\x01") + remoteIPv4Address = tcpip.Address("\x0a\x00\x00\x02") + serverPort = 10001 + + defaultMTU = 1500 + defaultBufferSize = 1500 +) + +type stackOptions struct { + ep stack.LinkEndpoint + addr tcpip.Address +} + +func newStackWithOptions(stackOpts stackOptions) (*stack.Stack, error) { + st := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{ + ipv4.NewProtocolWithOptions(ipv4.Options{ + AllowExternalLoopbackTraffic: true, + }), + ipv6.NewProtocolWithOptions(ipv6.Options{ + AllowExternalLoopbackTraffic: true, + }), + }, + TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol}, + }) + nicID := tcpip.NICID(1) + sniffEP := sniffer.New(stackOpts.ep) + opts := stack.NICOptions{Name: "eth0"} + if err := st.CreateNICWithOptions(nicID, sniffEP, opts); err != nil { + return nil, fmt.Errorf("method CreateNICWithOptions(%d, _, %v) failed: %s", nicID, opts, err) + } + + // Add Protocol Address. + protocolNum := ipv4.ProtocolNumber + routeTable := []tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: nicID}} + if len(stackOpts.addr) == 16 { + routeTable = []tcpip.Route{{Destination: header.IPv6EmptySubnet, NIC: nicID}} + protocolNum = ipv6.ProtocolNumber + } + protocolAddr := tcpip.ProtocolAddress{ + Protocol: protocolNum, + AddressWithPrefix: stackOpts.addr.WithPrefix(), + } + if err := st.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil { + return nil, fmt.Errorf("AddProtocolAddress(%d, %v, {}): %s", nicID, protocolAddr, err) + } + + // Setup route table. + st.SetRouteTable(routeTable) + + return st, nil +} + +func newClientStack(t *testing.T, qPair *sharedmem.QueuePair, peerFD int) (*stack.Stack, error) { + ep, err := sharedmem.New(sharedmem.Options{ + MTU: defaultMTU, + BufferSize: defaultBufferSize, + LinkAddress: localLinkAddr, + TX: qPair.TXQueueConfig(), + RX: qPair.RXQueueConfig(), + PeerFD: peerFD, + }) + if err != nil { + return nil, fmt.Errorf("failed to create sharedmem endpoint: %s", err) + } + st, err := newStackWithOptions(stackOptions{ep: ep, addr: localIPv4Address}) + if err != nil { + return nil, fmt.Errorf("failed to create client stack: %s", err) + } + return st, nil +} + +func newServerStack(t *testing.T, qPair *sharedmem.QueuePair, peerFD int) (*stack.Stack, error) { + ep, err := sharedmem.NewServerEndpoint(sharedmem.Options{ + MTU: defaultMTU, + BufferSize: defaultBufferSize, + LinkAddress: remoteLinkAddr, + TX: qPair.TXQueueConfig(), + RX: qPair.RXQueueConfig(), + PeerFD: peerFD, + }) + if err != nil { + return nil, fmt.Errorf("failed to create sharedmem endpoint: %s", err) + } + st, err := newStackWithOptions(stackOptions{ep: ep, addr: remoteIPv4Address}) + if err != nil { + return nil, fmt.Errorf("failed to create client stack: %s", err) + } + return st, nil +} + +type testContext struct { + clientStk *stack.Stack + serverStk *stack.Stack + peerFDs [2]int +} + +func newTestContext(t *testing.T) *testContext { + peerFDs, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET|syscall.SOCK_NONBLOCK, 0) + if err != nil { + t.Fatalf("failed to create peerFDs: %s", err) + } + q, err := sharedmem.NewQueuePair() + if err != nil { + t.Fatalf("failed to create sharedmem queue: %s", err) + } + clientStack, err := newClientStack(t, q, peerFDs[0]) + if err != nil { + q.Close() + unix.Close(peerFDs[0]) + unix.Close(peerFDs[1]) + t.Fatalf("failed to create client stack: %s", err) + } + serverStack, err := newServerStack(t, q, peerFDs[1]) + if err != nil { + q.Close() + unix.Close(peerFDs[0]) + unix.Close(peerFDs[1]) + clientStack.Close() + t.Fatalf("failed to create server stack: %s", err) + } + return &testContext{ + clientStk: clientStack, + serverStk: serverStack, + peerFDs: peerFDs, + } +} + +func (ctx *testContext) cleanup() { + unix.Close(ctx.peerFDs[0]) + unix.Close(ctx.peerFDs[1]) + ctx.clientStk.Close() + ctx.serverStk.Close() +} + +func TestServerRoundTrip(t *testing.T) { + ctx := newTestContext(t) + defer ctx.cleanup() + listenAddr := tcpip.FullAddress{Addr: remoteIPv4Address, Port: serverPort} + l, err := gonet.ListenTCP(ctx.serverStk, listenAddr, ipv4.ProtocolNumber) + if err != nil { + t.Fatalf("failed to start TCP Listener: %s", err) + } + defer l.Close() + var responseString = "response" + go func() { + http.Serve(l, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte(responseString)) + })) + }() + + dialFunc := func(address, protocol string) (net.Conn, error) { + return gonet.DialTCP(ctx.clientStk, listenAddr, ipv4.ProtocolNumber) + } + + httpClient := &http.Client{ + Transport: &http.Transport{ + Dial: dialFunc, + }, + } + serverURL := fmt.Sprintf("http://[%s]:%d/", net.IP(remoteIPv4Address), serverPort) + response, err := httpClient.Get(serverURL) + if err != nil { + t.Fatalf("httpClient.Get(\"/\") failed: %s", err) + } + if got, want := response.StatusCode, http.StatusOK; got != want { + t.Fatalf("unexpected status code got: %d, want: %d", got, want) + } + body, err := io.ReadAll(response.Body) + if err != nil { + t.Fatalf("io.ReadAll(response.Body) failed: %s", err) + } + response.Body.Close() + if got, want := string(body), responseString; got != want { + t.Fatalf("unexpected response got: %s, want: %s", got, want) + } +} diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index d6d953085..66ffc33b8 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -19,9 +19,7 @@ package sharedmem import ( "bytes" - "io/ioutil" "math/rand" - "os" "strings" "testing" "time" @@ -104,24 +102,36 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress t: t, packetCh: make(chan struct{}, 1000000), } - c.txCfg = createQueueFDs(t, queueSizes{ + c.txCfg, err = createQueueFDs(queueSizes{ dataSize: queueDataSize, txPipeSize: queuePipeSize, rxPipeSize: queuePipeSize, sharedDataSize: 4096, }) - - c.rxCfg = createQueueFDs(t, queueSizes{ + if err != nil { + t.Fatalf("createQueueFDs for tx failed: %s", err) + } + c.rxCfg, err = createQueueFDs(queueSizes{ dataSize: queueDataSize, txPipeSize: queuePipeSize, rxPipeSize: queuePipeSize, sharedDataSize: 4096, }) + if err != nil { + t.Fatalf("createQueueFDs for rx failed: %s", err) + } initQueue(t, &c.txq, &c.txCfg) initQueue(t, &c.rxq, &c.rxCfg) - ep, err := New(mtu, bufferSize, addr, c.txCfg, c.rxCfg) + ep, err := New(Options{ + MTU: mtu, + BufferSize: bufferSize, + LinkAddress: addr, + TX: c.txCfg, + RX: c.rxCfg, + PeerFD: -1, + }) if err != nil { t.Fatalf("New failed: %v", err) } @@ -150,8 +160,8 @@ func (c *testContext) DeliverOutboundPacket(remoteLinkAddr, localLinkAddr tcpip. func (c *testContext) cleanup() { c.ep.Close() - closeFDs(&c.txCfg) - closeFDs(&c.rxCfg) + closeFDs(c.txCfg) + closeFDs(c.rxCfg) c.txq.cleanup() c.rxq.cleanup() } @@ -191,69 +201,6 @@ func shuffle(b []int) { } } -func createFile(t *testing.T, size int64, initQueue bool) int { - tmpDir, ok := os.LookupEnv("TEST_TMPDIR") - if !ok { - tmpDir = os.Getenv("TMPDIR") - } - f, err := ioutil.TempFile(tmpDir, "sharedmem_test") - if err != nil { - t.Fatalf("TempFile failed: %v", err) - } - defer f.Close() - unix.Unlink(f.Name()) - - if initQueue { - // Write the "slot-free" flag in the initial queue. - _, err := f.WriteAt([]byte{0, 0, 0, 0, 0, 0, 0, 0x80}, 0) - if err != nil { - t.Fatalf("WriteAt failed: %v", err) - } - } - - fd, err := unix.Dup(int(f.Fd())) - if err != nil { - t.Fatalf("Dup failed: %v", err) - } - - if err := unix.Ftruncate(fd, size); err != nil { - unix.Close(fd) - t.Fatalf("Ftruncate failed: %v", err) - } - - return fd -} - -func closeFDs(c *QueueConfig) { - unix.Close(c.DataFD) - unix.Close(c.EventFD) - unix.Close(c.TxPipeFD) - unix.Close(c.RxPipeFD) - unix.Close(c.SharedDataFD) -} - -type queueSizes struct { - dataSize int64 - txPipeSize int64 - rxPipeSize int64 - sharedDataSize int64 -} - -func createQueueFDs(t *testing.T, s queueSizes) QueueConfig { - fd, _, err := unix.RawSyscall(unix.SYS_EVENTFD2, 0, 0, 0) - if err != 0 { - t.Fatalf("eventfd failed: %v", error(err)) - } - - return QueueConfig{ - EventFD: int(fd), - DataFD: createFile(t, s.dataSize, false), - TxPipeFD: createFile(t, s.txPipeSize, true), - RxPipeFD: createFile(t, s.rxPipeSize, true), - SharedDataFD: createFile(t, s.sharedDataSize, false), - } -} - // TestSimpleSend sends 1000 packets with random header and payload sizes, // then checks that the right payload is received on the shared memory queues. func TestSimpleSend(t *testing.T) { @@ -672,7 +619,7 @@ func TestSimpleReceive(t *testing.T) { // Push completion. c.pushRxCompletion(uint32(len(contents)), bufs) c.rxq.rx.Flush() - unix.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + c.rxCfg.EventFD.Notify() // Wait for packet to be received, then check it. c.waitForPackets(1, time.After(5*time.Second), "Timeout waiting for packet") @@ -718,7 +665,7 @@ func TestRxBuffersReposted(t *testing.T) { // Complete the buffer. c.pushRxCompletion(buffers[i].Size, buffers[i:][:1]) c.rxq.rx.Flush() - unix.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + c.rxCfg.EventFD.Notify() // Wait for it to be reposted. bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for buffer to be reposted")) @@ -734,7 +681,7 @@ func TestRxBuffersReposted(t *testing.T) { // Complete with two buffers. c.pushRxCompletion(2*bufferSize, buffers[2*i:][:2]) c.rxq.rx.Flush() - unix.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + c.rxCfg.EventFD.Notify() // Wait for them to be reposted. for j := 0; j < 2; j++ { @@ -759,7 +706,7 @@ func TestReceivePostingIsFull(t *testing.T) { first := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for first buffer to be posted")) c.pushRxCompletion(first.Size, []queue.RxBuffer{first}) c.rxq.rx.Flush() - unix.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + c.rxCfg.EventFD.Notify() // Check that packet is received. c.waitForPackets(1, time.After(time.Second), "Timeout waiting for completed packet") @@ -768,7 +715,7 @@ func TestReceivePostingIsFull(t *testing.T) { second := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for second buffer to be posted")) c.pushRxCompletion(second.Size, []queue.RxBuffer{second}) c.rxq.rx.Flush() - unix.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + c.rxCfg.EventFD.Notify() // Check that no packet is received yet, as the worker is blocked trying // to repost. @@ -781,7 +728,7 @@ func TestReceivePostingIsFull(t *testing.T) { // Flush tx queue, which will allow the first buffer to be reposted, // and the second completion to be pulled. c.rxq.tx.Flush() - unix.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + c.rxCfg.EventFD.Notify() // Check that second packet completes. c.waitForPackets(1, time.After(time.Second), "Timeout waiting for second completed packet") @@ -803,7 +750,7 @@ func TestCloseWhileWaitingToPost(t *testing.T) { bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for initial buffer to be posted")) c.pushRxCompletion(bi.Size, []queue.RxBuffer{bi}) c.rxq.rx.Flush() - unix.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0}) + c.rxCfg.EventFD.Notify() // Wait for packet to be indicated. c.waitForPackets(1, time.After(time.Second), "Timeout waiting for completed packet") diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go index e3210051f..35e5bff12 100644 --- a/pkg/tcpip/link/sharedmem/tx.go +++ b/pkg/tcpip/link/sharedmem/tx.go @@ -18,6 +18,7 @@ import ( "math" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" ) @@ -28,10 +29,12 @@ const ( // tx holds all state associated with a tx queue. type tx struct { - data []byte - q queue.Tx - ids idManager - bufs bufferManager + data []byte + q queue.Tx + ids idManager + bufs bufferManager + eventFD eventfd.Eventfd + sharedDataFD int } // init initializes all state needed by the tx queue based on the information @@ -64,7 +67,8 @@ func (t *tx) init(mtu uint32, c *QueueConfig) error { t.ids.init() t.bufs.init(0, len(data), int(mtu)) t.data = data - + t.eventFD = c.EventFD + t.sharedDataFD = c.SharedDataFD return nil } @@ -142,6 +146,12 @@ func (t *tx) transmit(bufs ...buffer.View) bool { return true } +// notify writes to the tx.eventFD to indicate to the peer that there is data to +// be read. +func (t *tx) notify() { + t.eventFD.Notify() +} + // getBuffer returns a memory region mapped to the full contents of the given // file descriptor. func getBuffer(fd int) ([]byte, error) { diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index d51c36f19..1c3b0887f 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -167,14 +167,17 @@ func (e *endpoint) handleControl(errInfo stack.TransportError, pkt *stack.Packet p := hdr.TransportProtocol() dstAddr := hdr.DestinationAddress() // Skip the ip header, then deliver the error. - pkt.Data().DeleteFront(hlen) + if _, ok := pkt.Data().Consume(hlen); !ok { + panic(fmt.Sprintf("could not consume the IP header of %d bytes", hlen)) + } e.dispatcher.DeliverTransportError(srcAddr, dstAddr, ProtocolNumber, p, errInfo, pkt) } func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { received := e.stats.icmp.packetsReceived // ICMP packets don't have their TransportHeader fields set. See - // icmp/protocol.go:protocol.Parse for a full explanation. + // icmp/protocol.go:protocol.Parse for a full explanation. Not all ICMP types + // require consuming the header, so we only call PullUp. v, ok := pkt.Data().PullUp(header.ICMPv4MinimumSize) if !ok { received.invalid.Increment() @@ -242,7 +245,8 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { // DeliverTransportPacket will take ownership of pkt so don't use it beyond // this point. Make a deep copy of the data before pkt gets sent as we will - // be modifying fields. + // be modifying fields. Both the ICMP header (with its type modified to + // EchoReply) and payload are reused in the reply packet. // // TODO(gvisor.dev/issue/4399): The copy may not be needed if there are no // waiting endpoints. Consider moving responsibility for doing the copy to @@ -331,6 +335,8 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { case header.ICMPv4EchoReply: received.echoReply.Increment() + // ICMP sockets expect the ICMP header to be present, so we don't consume + // the ICMP header. e.dispatcher.DeliverTransportPacket(header.ICMPv4ProtocolNumber, pkt) case header.ICMPv4DstUnreachable: @@ -338,7 +344,9 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { mtu := h.MTU() code := h.Code() - pkt.Data().DeleteFront(header.ICMPv4MinimumSize) + if _, ok := pkt.Data().Consume(header.ICMPv4MinimumSize); !ok { + panic("could not consume ICMPv4MinimumSize bytes") + } switch code { case header.ICMPv4HostUnreachable: e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt) diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index dda473e48..9b71738ae 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -466,7 +466,7 @@ func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer, headerIn // Postrouting NAT can only change the source address, and does not alter the // route or outgoing interface of the packet. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) - if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, outNicName); !ok { + if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, e, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesPostroutingDropped.Increment() return nil @@ -576,7 +576,7 @@ func (e *endpoint) WritePackets(r *stack.Route, pkts stack.PacketBufferList, par // We ignore the list of NAT-ed packets here because Postrouting NAT can only // change the source address, and does not alter the route or outgoing // interface of the packet. - postroutingDropped, _ := e.protocol.stack.IPTables().CheckPostroutingPackets(pkts, r, outNicName) + postroutingDropped, _ := e.protocol.stack.IPTables().CheckPostroutingPackets(pkts, r, e, outNicName) stats.IPTablesPostroutingDropped.IncrementBy(uint64(len(postroutingDropped))) for pkt := range postroutingDropped { pkts.Remove(pkt) diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 6c6107264..ff23d48e7 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -187,7 +187,9 @@ func (e *endpoint) handleControl(transErr stack.TransportError, pkt *stack.Packe // Skip the IP header, then handle the fragmentation header if there // is one. - pkt.Data().DeleteFront(header.IPv6MinimumSize) + if _, ok := pkt.Data().Consume(header.IPv6MinimumSize); !ok { + panic("could not consume IPv6MinimumSize bytes") + } if p == header.IPv6FragmentHeader { f, ok := pkt.Data().PullUp(header.IPv6FragmentHeaderSize) if !ok { @@ -203,7 +205,9 @@ func (e *endpoint) handleControl(transErr stack.TransportError, pkt *stack.Packe // Skip fragmentation header and find out the actual protocol // number. - pkt.Data().DeleteFront(header.IPv6FragmentHeaderSize) + if _, ok := pkt.Data().Consume(header.IPv6FragmentHeaderSize); !ok { + panic("could not consume IPv6FragmentHeaderSize bytes") + } } e.dispatcher.DeliverTransportError(srcAddr, dstAddr, ProtocolNumber, p, transErr, pkt) @@ -325,7 +329,7 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool, r switch icmpType := h.Type(); icmpType { case header.ICMPv6PacketTooBig: received.packetTooBig.Increment() - hdr, ok := pkt.Data().PullUp(header.ICMPv6PacketTooBigMinimumSize) + hdr, ok := pkt.Data().Consume(header.ICMPv6PacketTooBigMinimumSize) if !ok { received.invalid.Increment() return @@ -334,18 +338,16 @@ func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool, r if err != nil { networkMTU = 0 } - pkt.Data().DeleteFront(header.ICMPv6PacketTooBigMinimumSize) e.handleControl(&icmpv6PacketTooBigSockError{mtu: networkMTU}, pkt) case header.ICMPv6DstUnreachable: received.dstUnreachable.Increment() - hdr, ok := pkt.Data().PullUp(header.ICMPv6DstUnreachableMinimumSize) + hdr, ok := pkt.Data().Consume(header.ICMPv6DstUnreachableMinimumSize) if !ok { received.invalid.Increment() return } code := header.ICMPv6(hdr).Code() - pkt.Data().DeleteFront(header.ICMPv6DstUnreachableMinimumSize) switch code { case header.ICMPv6NetworkUnreachable: e.handleControl(&icmpv6DestinationNetworkUnreachableSockError{}, pkt) diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index e2d2cf907..600e805f8 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -788,7 +788,7 @@ func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer, protocol // Postrouting NAT can only change the source address, and does not alter the // route or outgoing interface of the packet. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) - if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, outNicName); !ok { + if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, e, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesPostroutingDropped.Increment() return nil @@ -897,7 +897,7 @@ func (e *endpoint) WritePackets(r *stack.Route, pkts stack.PacketBufferList, par // We ignore the list of NAT-ed packets here because Postrouting NAT can only // change the source address, and does not alter the route or outgoing // interface of the packet. - postroutingDropped, _ := e.protocol.stack.IPTables().CheckPostroutingPackets(pkts, r, outNicName) + postroutingDropped, _ := e.protocol.stack.IPTables().CheckPostroutingPackets(pkts, r, e, outNicName) stats.IPTablesPostroutingDropped.IncrementBy(uint64(len(postroutingDropped))) for pkt := range postroutingDropped { pkts.Remove(pkt) @@ -1537,19 +1537,22 @@ func (e *endpoint) processExtensionHeaders(h header.IPv6, pkt *stack.PacketBuffe // If the last header in the payload isn't a known IPv6 extension header, // handle it as if it is transport layer data. - // Calculate the number of octets parsed from data. We want to remove all - // the data except the unparsed portion located at the end, which its size - // is extHdr.Buf.Size(). + // Calculate the number of octets parsed from data. We want to consume all + // the data except the unparsed portion located at the end, whose size is + // extHdr.Buf.Size(). trim := pkt.Data().Size() - extHdr.Buf.Size() // For unfragmented packets, extHdr still contains the transport header. - // Get rid of it. + // Consume that too. // // For reassembled fragments, pkt.TransportHeader is unset, so this is a // no-op and pkt.Data begins with the transport header. trim += pkt.TransportHeader().View().Size() - pkt.Data().DeleteFront(trim) + if _, ok := pkt.Data().Consume(trim); !ok { + stats.MalformedPacketsReceived.Increment() + return fmt.Errorf("could not consume %d bytes", trim) + } stats.PacketsDelivered.Increment() if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber { diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 4fb7e9adb..48f290187 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -45,17 +45,6 @@ const ( dirReply ) -// Manipulation type for the connection. -// TODO(gvisor.dev/issue/5696): Define this as a bit set and support SNAT and -// DNAT at the same time. -type manipType int - -const ( - manipNone manipType = iota - manipSource - manipDestination -) - // tuple holds a connection's identifying and manipulating data in one // direction. It is immutable. // @@ -64,13 +53,21 @@ type tuple struct { // tupleEntry is used to build an intrusive list of tuples. tupleEntry - tupleID - // conn is the connection tracking entry this tuple belongs to. conn *conn // direction is the direction of the tuple. direction direction + + mu sync.RWMutex `state:"nosave"` + // +checklocks:mu + tupleID tupleID +} + +func (t *tuple) id() tupleID { + t.mu.RLock() + defer t.mu.RUnlock() + return t.tupleID } // tupleID uniquely identifies a connection in one direction. It currently @@ -103,50 +100,47 @@ func (ti tupleID) reply() tupleID { // // +stateify savable type conn struct { + ct *ConnTrack + // original is the tuple in original direction. It is immutable. original tuple - // reply is the tuple in reply direction. It is immutable. + // reply is the tuple in reply direction. reply tuple - // manip indicates if the packet should be manipulated. It is immutable. - // TODO(gvisor.dev/issue/5696): Support updating manipulation type. - manip manipType - - // tcbHook indicates if the packet is inbound or outbound to - // update the state of tcb. It is immutable. - tcbHook Hook - - // mu protects all mutable state. - mu sync.Mutex `state:"nosave"` + mu sync.RWMutex `state:"nosave"` + // Indicates that the connection has been finalized and may handle replies. + // + // +checklocks:mu + finalized bool + // sourceManip indicates the packet's source is manipulated. + // + // +checklocks:mu + sourceManip bool + // destinationManip indicates the packet's destination is manipulated. + // + // +checklocks:mu + destinationManip bool // tcb is TCB control block. It is used to keep track of states - // of tcp connection and is protected by mu. + // of tcp connection. + // + // +checklocks:mu tcb tcpconntrack.TCB // lastUsed is the last time the connection saw a relevant packet, and - // is updated by each packet on the connection. It is protected by mu. + // is updated by each packet on the connection. // // TODO(gvisor.dev/issue/5939): do not use the ambient clock. + // + // +checklocks:mu lastUsed time.Time `state:".(unixTime)"` } -// newConn creates new connection. -func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn { - conn := conn{ - manip: manip, - tcbHook: hook, - lastUsed: time.Now(), - } - conn.original = tuple{conn: &conn, tupleID: orig} - conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply} - return &conn -} - // timedOut returns whether the connection timed out based on its state. func (cn *conn) timedOut(now time.Time) bool { const establishedTimeout = 5 * 24 * time.Hour const defaultTimeout = 120 * time.Second - cn.mu.Lock() - defer cn.mu.Unlock() + cn.mu.RLock() + defer cn.mu.RUnlock() if cn.tcb.State() == tcpconntrack.ResultAlive { // Use the same default as Linux, which doesn't delete // established connections for 5(!) days. @@ -159,8 +153,9 @@ func (cn *conn) timedOut(now time.Time) bool { // update the connection tracking state. // -// Precondition: cn.mu must be held. -func (cn *conn) updateLocked(pkt *PacketBuffer, hook Hook) { +// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements. +// +checklocks:cn.mu +func (cn *conn) updateLocked(pkt *PacketBuffer, dir direction) { if pkt.TransportProtocolNumber != header.TCPProtocolNumber { return } @@ -172,10 +167,16 @@ func (cn *conn) updateLocked(pkt *PacketBuffer, hook Hook) { // established or not, so the client/server distinction isn't important. if cn.tcb.IsEmpty() { cn.tcb.Init(tcpHeader) - } else if hook == cn.tcbHook { + return + } + + switch dir { + case dirOriginal: cn.tcb.UpdateStateOutbound(tcpHeader) - } else { + case dirReply: cn.tcb.UpdateStateInbound(tcpHeader) + default: + panic(fmt.Sprintf("unhandled dir = %d", dir)) } } @@ -200,18 +201,18 @@ type ConnTrack struct { // It is immutable. seed uint32 + mu sync.RWMutex `state:"nosave"` // mu protects the buckets slice, but not buckets' contents. Only take // the write lock if you are modifying the slice or saving for S/R. - mu sync.RWMutex `state:"nosave"` - - // buckets is protected by mu. + // + // +checklocks:mu buckets []bucket } // +stateify savable type bucket struct { - // mu protects tuples. - mu sync.Mutex `state:"nosave"` + mu sync.RWMutex `state:"nosave"` + // +checklocks:mu tuples tupleList } @@ -230,241 +231,212 @@ func getTransportHeader(pkt *PacketBuffer) (header.ChecksummableTransport, bool) return nil, false } -// packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid -// TCP header. -// -// Preconditions: pkt.NetworkHeader() is valid. -func packetToTupleID(pkt *PacketBuffer) (tupleID, tcpip.Error) { +func (ct *ConnTrack) init() { + ct.mu.Lock() + defer ct.mu.Unlock() + ct.buckets = make([]bucket, numBuckets) +} + +func (ct *ConnTrack) getConnOrMaybeInsertNoop(pkt *PacketBuffer) *tuple { netHeader := pkt.Network() transportHeader, ok := getTransportHeader(pkt) if !ok { - return tupleID{}, &tcpip.ErrUnknownProtocol{} + return nil } - return tupleID{ + tid := tupleID{ srcAddr: netHeader.SourceAddress(), srcPort: transportHeader.SourcePort(), dstAddr: netHeader.DestinationAddress(), dstPort: transportHeader.DestinationPort(), transProto: pkt.TransportProtocolNumber, netProto: pkt.NetworkProtocolNumber, - }, nil -} - -func (ct *ConnTrack) init() { - ct.mu.Lock() - defer ct.mu.Unlock() - ct.buckets = make([]bucket, numBuckets) -} - -// connFor gets the conn for pkt if it exists, or returns nil -// if it does not. It returns an error when pkt does not contain a valid TCP -// header. -// TODO(gvisor.dev/issue/6168): Support UDP. -func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { - tid, err := packetToTupleID(pkt) - if err != nil { - return nil, dirOriginal } - return ct.connForTID(tid) -} -func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) { - bucket := ct.bucket(tid) - now := time.Now() + bktID := ct.bucket(tid) ct.mu.RLock() - defer ct.mu.RUnlock() - ct.buckets[bucket].mu.Lock() - defer ct.buckets[bucket].mu.Unlock() - - // Iterate over the tuples in a bucket, cleaning up any unused - // connections we find. - for other := ct.buckets[bucket].tuples.Front(); other != nil; other = other.Next() { - // Clean up any timed-out connections we happen to find. - if ct.reapTupleLocked(other, bucket, now) { - // The tuple expired. - continue - } - if tid == other.tupleID { - return other.conn, other.direction - } + bkt := &ct.buckets[bktID] + ct.mu.RUnlock() + + now := time.Now() + if t := bkt.connForTID(tid, now); t != nil { + return t } - return nil, dirOriginal -} + bkt.mu.Lock() + defer bkt.mu.Unlock() -func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, port uint16, address tcpip.Address) *conn { - tid, err := packetToTupleID(pkt) - if err != nil { - return nil + // Make sure a connection wasn't added between when we last checked the + // bucket and acquired the bucket's write lock. + if t := bkt.connForTIDRLocked(tid, now); t != nil { + return t } - if hook != Prerouting && hook != Output { - return nil + + // This is the first packet we're seeing for the connection. Create an entry + // for this new connection. + conn := &conn{ + ct: ct, + original: tuple{tupleID: tid, direction: dirOriginal}, + reply: tuple{tupleID: tid.reply(), direction: dirReply}, + lastUsed: now, } + conn.original.conn = conn + conn.reply.conn = conn - replyTID := tid.reply() - replyTID.srcAddr = address - replyTID.srcPort = port + // For now, we only map an entry for the packet's original tuple as NAT may be + // performed on this connection. Until the packet goes through all the hooks + // and its final address/port is known, we cannot know what the response + // packet's addresses/ports will look like. + // + // This is okay because the destination cannot send its response until it + // receives the packet; the packet will only be received once all the hooks + // have been performed. + // + // See (*conn).finalize. + bkt.tuples.PushFront(&conn.original) + return &conn.original +} - conn, _ := ct.connForTID(tid) - if conn != nil { - // The connection is already tracked. - // TODO(gvisor.dev/issue/5696): Support updating an existing connection. - return nil - } - conn = newConn(tid, replyTID, manipDestination, hook) - ct.insertConn(conn) - return conn +func (ct *ConnTrack) connForTID(tid tupleID) *tuple { + bktID := ct.bucket(tid) + + ct.mu.RLock() + bkt := &ct.buckets[bktID] + ct.mu.RUnlock() + + return bkt.connForTID(tid, time.Now()) } -func (ct *ConnTrack) insertSNATConn(pkt *PacketBuffer, hook Hook, port uint16, address tcpip.Address) *conn { - tid, err := packetToTupleID(pkt) - if err != nil { - return nil - } - if hook != Input && hook != Postrouting { - return nil +func (bkt *bucket) connForTID(tid tupleID, now time.Time) *tuple { + bkt.mu.RLock() + defer bkt.mu.RUnlock() + return bkt.connForTIDRLocked(tid, now) +} + +// +checklocks:bkt.mu +func (bkt *bucket) connForTIDRLocked(tid tupleID, now time.Time) *tuple { + for other := bkt.tuples.Front(); other != nil; other = other.Next() { + if tid == other.id() && !other.conn.timedOut(now) { + return other + } } + return nil +} - replyTID := tid.reply() - replyTID.dstAddr = address - replyTID.dstPort = port +func (ct *ConnTrack) finalize(cn *conn) { + tid := cn.reply.id() + id := ct.bucket(tid) - conn, _ := ct.connForTID(tid) - if conn != nil { - // The connection is already tracked. - // TODO(gvisor.dev/issue/5696): Support updating an existing connection. - return nil + ct.mu.RLock() + bkt := &ct.buckets[id] + ct.mu.RUnlock() + + bkt.mu.Lock() + defer bkt.mu.Unlock() + + if t := bkt.connForTIDRLocked(tid, time.Now()); t != nil { + // Another connection for the reply already exists. We can't do much about + // this so we leave the connection cn represents in a state where it can + // send packets but its responses will be mapped to some other connection. + // This may be okay if the connection only expects to send packets without + // any responses. + return } - conn = newConn(tid, replyTID, manipSource, hook) - ct.insertConn(conn) - return conn + + bkt.tuples.PushFront(&cn.reply) } -// insertConn inserts conn into the appropriate table bucket. -func (ct *ConnTrack) insertConn(conn *conn) { - // Lock the buckets in the correct order. - tupleBucket := ct.bucket(conn.original.tupleID) - replyBucket := ct.bucket(conn.reply.tupleID) - ct.mu.RLock() - defer ct.mu.RUnlock() - if tupleBucket < replyBucket { - ct.buckets[tupleBucket].mu.Lock() - ct.buckets[replyBucket].mu.Lock() - } else if tupleBucket > replyBucket { - ct.buckets[replyBucket].mu.Lock() - ct.buckets[tupleBucket].mu.Lock() - } else { - // Both tuples are in the same bucket. - ct.buckets[tupleBucket].mu.Lock() - } - - // Now that we hold the locks, ensure the tuple hasn't been inserted by - // another thread. - // TODO(gvisor.dev/issue/5773): Should check conn.reply.tupleID, too? - alreadyInserted := false - for other := ct.buckets[tupleBucket].tuples.Front(); other != nil; other = other.Next() { - if other.tupleID == conn.original.tupleID { - alreadyInserted = true - break +func (cn *conn) finalize() { + { + cn.mu.RLock() + finalized := cn.finalized + cn.mu.RUnlock() + if finalized { + return } } - if !alreadyInserted { - // Add the tuple to the map. - ct.buckets[tupleBucket].tuples.PushFront(&conn.original) - ct.buckets[replyBucket].tuples.PushFront(&conn.reply) + cn.mu.Lock() + finalized := cn.finalized + cn.finalized = true + cn.mu.Unlock() + if finalized { + return } - // Unlocking can happen in any order. - ct.buckets[tupleBucket].mu.Unlock() - if tupleBucket != replyBucket { - ct.buckets[replyBucket].mu.Unlock() // +checklocksforce - } + cn.ct.finalize(cn) } -// handlePacket will manipulate the port and address of the packet if the -// connection exists. Returns whether, after the packet traverses the tables, -// it should create a new entry in the table. -func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) bool { - if pkt.NatDone { - return false - } +// performNAT setups up the connection for the specified NAT. +// +// Generally, only the first packet of a connection reaches this method; other +// other packets will be manipulated without needing to modify the connection. +func (cn *conn) performNAT(pkt *PacketBuffer, hook Hook, r *Route, port uint16, address tcpip.Address, dnat bool) { + cn.performNATIfNoop(port, address, dnat) + cn.handlePacket(pkt, hook, r) +} - switch hook { - case Prerouting, Input, Output, Postrouting: - default: - return false - } +func (cn *conn) performNATIfNoop(port uint16, address tcpip.Address, dnat bool) { + cn.mu.Lock() + defer cn.mu.Unlock() - transportHeader, ok := getTransportHeader(pkt) - if !ok { - return false + if cn.finalized { + return } - conn, dir := ct.connFor(pkt) - // Connection not found for the packet. - if conn == nil { - // If this is the last hook in the data path for this packet (Input if - // incoming, Postrouting if outgoing), indicate that a connection should be - // inserted by the end of this hook. - return hook == Input || hook == Postrouting + if dnat { + if cn.destinationManip { + return + } + cn.destinationManip = true + } else { + if cn.sourceManip { + return + } + cn.sourceManip = true } - netHeader := pkt.Network() - - // TODO(gvisor.dev/issue/5748): TCP checksums on inbound packets should be - // validated if checksum offloading is off. It may require IP defrag if the - // packets are fragmented. - - var newAddr tcpip.Address - var newPort uint16 + cn.reply.mu.Lock() + defer cn.reply.mu.Unlock() - updateSRCFields := false + if dnat { + cn.reply.tupleID.srcAddr = address + cn.reply.tupleID.srcPort = port + } else { + cn.reply.tupleID.dstAddr = address + cn.reply.tupleID.dstPort = port + } +} - switch hook { - case Prerouting, Output: - if conn.manip == manipDestination && dir == dirOriginal { - newPort = conn.reply.srcPort - newAddr = conn.reply.srcAddr - pkt.NatDone = true - } else if conn.manip == manipSource && dir == dirReply { - newPort = conn.original.srcPort - newAddr = conn.original.srcAddr - pkt.NatDone = true - } - case Input, Postrouting: - if conn.manip == manipSource && dir == dirOriginal { - newPort = conn.reply.dstPort - newAddr = conn.reply.dstAddr - updateSRCFields = true - pkt.NatDone = true - } else if conn.manip == manipDestination && dir == dirReply { - newPort = conn.original.dstPort - newAddr = conn.original.dstAddr - updateSRCFields = true - pkt.NatDone = true - } - default: - panic(fmt.Sprintf("unrecognized hook = %s", hook)) +func (cn *conn) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) { + if pkt.NatDone { + return } - if !pkt.NatDone { - return false + transportHeader, ok := getTransportHeader(pkt) + if !ok { + return } fullChecksum := false updatePseudoHeader := false + dnat := false switch hook { case Prerouting: // Packet came from outside the stack so it must have a checksum set // already. fullChecksum = true updatePseudoHeader = true + + dnat = true case Input: - case Output, Postrouting: - // Calculate the TCP checksum and set it. + case Forward: + panic("should not handle packet in the forwarding hook") + case Output: + dnat = true + fallthrough + case Postrouting: if pkt.TransportProtocolNumber == header.TCPProtocolNumber && pkt.GSOOptions.Type != GSONone && pkt.GSOOptions.NeedsCsum { updatePseudoHeader = true } else if r.RequiresTXTransportChecksum() { @@ -472,62 +444,73 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) bool { updatePseudoHeader = true } default: - panic(fmt.Sprintf("unrecognized hook = %s", hook)) + panic(fmt.Sprintf("unrecognized hook = %d", hook)) } - rewritePacket( - netHeader, - transportHeader, - updateSRCFields, - fullChecksum, - updatePseudoHeader, - newPort, - newAddr, - ) + // TODO(gvisor.dev/issue/5748): TCP checksums on inbound packets should be + // validated if checksum offloading is off. It may require IP defrag if the + // packets are fragmented. - // Update the state of tcb. - conn.mu.Lock() - defer conn.mu.Unlock() + dir := pkt.tuple.direction + tid, performManip := func() (tupleID, bool) { + cn.mu.Lock() + defer cn.mu.Unlock() + + var tuple *tuple + switch dir { + case dirOriginal: + if dnat { + if !cn.destinationManip { + return tupleID{}, false + } + } else if !cn.sourceManip { + return tupleID{}, false + } - // Mark the connection as having been used recently so it isn't reaped. - conn.lastUsed = time.Now() - // Update connection state. - conn.updateLocked(pkt, hook) + tuple = &cn.reply + case dirReply: + if dnat { + if !cn.sourceManip { + return tupleID{}, false + } + } else if !cn.destinationManip { + return tupleID{}, false + } - return false -} + tuple = &cn.original + default: + panic(fmt.Sprintf("unhandled dir = %d", dir)) + } -// maybeInsertNoop tries to insert a no-op connection entry to keep connections -// from getting clobbered when replies arrive. It only inserts if there isn't -// already a connection for pkt. -// -// This should be called after traversing iptables rules only, to ensure that -// pkt.NatDone is set correctly. -func (ct *ConnTrack) maybeInsertNoop(pkt *PacketBuffer, hook Hook) { - // If there were a rule applying to this packet, it would be marked - // with NatDone. - if pkt.NatDone { - return - } + // Mark the connection as having been used recently so it isn't reaped. + cn.lastUsed = time.Now() + // Update connection state. + cn.updateLocked(pkt, dir) - switch pkt.TransportProtocolNumber { - case header.TCPProtocolNumber, header.UDPProtocolNumber: - default: - // TODO(https://gvisor.dev/issue/5915): Track ICMP and other trackable - // connections. + return tuple.id(), true + }() + if !performManip { return } - // This is the first packet we're seeing for the TCP connection. Insert - // the noop entry (an identity mapping) so that the response doesn't - // get NATed, breaking the connection. - tid, err := packetToTupleID(pkt) - if err != nil { - return + newPort := tid.dstPort + newAddr := tid.dstAddr + if dnat { + newPort = tid.srcPort + newAddr = tid.srcAddr } - conn := newConn(tid, tid.reply(), manipNone, hook) - conn.updateLocked(pkt, hook) - ct.insertConn(conn) + + rewritePacket( + pkt.Network(), + transportHeader, + !dnat, + fullChecksum, + updatePseudoHeader, + newPort, + newAddr, + ) + + pkt.NatDone = true } // bucket gets the conntrack bucket for a tupleID. @@ -579,14 +562,15 @@ func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, tim defer ct.mu.RUnlock() for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ { idx = (i + start) % len(ct.buckets) - ct.buckets[idx].mu.Lock() - for tuple := ct.buckets[idx].tuples.Front(); tuple != nil; tuple = tuple.Next() { + bkt := &ct.buckets[idx] + bkt.mu.Lock() + for tuple := bkt.tuples.Front(); tuple != nil; tuple = tuple.Next() { checked++ - if ct.reapTupleLocked(tuple, idx, now) { + if ct.reapTupleLocked(tuple, idx, bkt, now) { expired++ } } - ct.buckets[idx].mu.Unlock() + bkt.mu.Unlock() } // We already checked buckets[idx]. idx++ @@ -611,41 +595,45 @@ func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, tim // reapTupleLocked tries to remove tuple and its reply from the table. It // returns whether the tuple's connection has timed out. // -// Preconditions: -// * ct.mu is locked for reading. -// * bucket is locked. -func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool { +// Precondition: ct.mu is read locked and bkt.mu is write locked. +// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements. +// +checklocks:ct.mu +// +checklocks:bkt.mu +func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bktID int, bkt *bucket, now time.Time) bool { if !tuple.conn.timedOut(now) { return false } // To maintain lock order, we can only reap these tuples if the reply // appears later in the table. - replyBucket := ct.bucket(tuple.reply()) - if bucket > replyBucket { + replyBktID := ct.bucket(tuple.id().reply()) + if bktID > replyBktID { return true } // Don't re-lock if both tuples are in the same bucket. - differentBuckets := bucket != replyBucket - if differentBuckets { - ct.buckets[replyBucket].mu.Lock() + if bktID != replyBktID { + replyBkt := &ct.buckets[replyBktID] + replyBkt.mu.Lock() + removeConnFromBucket(replyBkt, tuple) + replyBkt.mu.Unlock() + } else { + removeConnFromBucket(bkt, tuple) } // We have the buckets locked and can remove both tuples. + bkt.tuples.Remove(tuple) + return true +} + +// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements. +// +checklocks:b.mu +func removeConnFromBucket(b *bucket, tuple *tuple) { if tuple.direction == dirOriginal { - ct.buckets[replyBucket].tuples.Remove(&tuple.conn.reply) + b.tuples.Remove(&tuple.conn.reply) } else { - ct.buckets[replyBucket].tuples.Remove(&tuple.conn.original) - } - ct.buckets[bucket].tuples.Remove(tuple) - - // Don't re-unlock if both tuples are in the same bucket. - if differentBuckets { - ct.buckets[replyBucket].mu.Unlock() // +checklocksforce + b.tuples.Remove(&tuple.conn.original) } - - return true } func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber) (tcpip.Address, uint16, tcpip.Error) { @@ -659,14 +647,19 @@ func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.Networ transProto: transProto, netProto: netProto, } - conn, _ := ct.connForTID(tid) - if conn == nil { + t := ct.connForTID(tid) + if t == nil { // Not a tracked connection. return "", 0, &tcpip.ErrNotConnected{} - } else if conn.manip != manipDestination { + } + + t.conn.mu.RLock() + defer t.conn.mu.RUnlock() + if !t.conn.destinationManip { // Unmanipulated destination. return "", 0, &tcpip.ErrInvalidOptionValue{} } - return conn.original.dstAddr, conn.original.dstPort, nil + id := t.conn.original.id() + return id.dstAddr, id.dstPort, nil } diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 74c9075b4..5808be685 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -271,7 +271,18 @@ const ( // // Precondition: The packet's network and transport header must be set. func (it *IPTables) CheckPrerouting(pkt *PacketBuffer, addressEP AddressableEndpoint, inNicName string) bool { - return it.check(Prerouting, pkt, nil /* route */, addressEP, inNicName, "" /* outNicName */) + const hook = Prerouting + + if it.shouldSkip(pkt.NetworkProtocolNumber) { + return true + } + + if t := it.connections.getConnOrMaybeInsertNoop(pkt); t != nil { + pkt.tuple = t + t.conn.handlePacket(pkt, hook, nil /* route */) + } + + return it.check(hook, pkt, nil /* route */, addressEP, inNicName, "" /* outNicName */) } // CheckInput performs the input hook on the packet. @@ -281,7 +292,22 @@ func (it *IPTables) CheckPrerouting(pkt *PacketBuffer, addressEP AddressableEndp // // Precondition: The packet's network and transport header must be set. func (it *IPTables) CheckInput(pkt *PacketBuffer, inNicName string) bool { - return it.check(Input, pkt, nil /* route */, nil /* addressEP */, inNicName, "" /* outNicName */) + const hook = Input + + if it.shouldSkip(pkt.NetworkProtocolNumber) { + return true + } + + if t := pkt.tuple; t != nil { + t.conn.handlePacket(pkt, hook, nil /* route */) + } + + ret := it.check(hook, pkt, nil /* route */, nil /* addressEP */, inNicName, "" /* outNicName */) + if t := pkt.tuple; t != nil { + t.conn.finalize() + } + pkt.tuple = nil + return ret } // CheckForward performs the forward hook on the packet. @@ -291,6 +317,9 @@ func (it *IPTables) CheckInput(pkt *PacketBuffer, inNicName string) bool { // // Precondition: The packet's network and transport header must be set. func (it *IPTables) CheckForward(pkt *PacketBuffer, inNicName, outNicName string) bool { + if it.shouldSkip(pkt.NetworkProtocolNumber) { + return true + } return it.check(Forward, pkt, nil /* route */, nil /* addressEP */, inNicName, outNicName) } @@ -301,7 +330,18 @@ func (it *IPTables) CheckForward(pkt *PacketBuffer, inNicName, outNicName string // // Precondition: The packet's network and transport header must be set. func (it *IPTables) CheckOutput(pkt *PacketBuffer, r *Route, outNicName string) bool { - return it.check(Output, pkt, r, nil /* addressEP */, "" /* inNicName */, outNicName) + const hook = Output + + if it.shouldSkip(pkt.NetworkProtocolNumber) { + return true + } + + if t := it.connections.getConnOrMaybeInsertNoop(pkt); t != nil { + pkt.tuple = t + t.conn.handlePacket(pkt, hook, r) + } + + return it.check(hook, pkt, r, nil /* addressEP */, "" /* inNicName */, outNicName) } // CheckPostrouting performs the postrouting hook on the packet. @@ -310,8 +350,38 @@ func (it *IPTables) CheckOutput(pkt *PacketBuffer, r *Route, outNicName string) // must be dropped if false is returned. // // Precondition: The packet's network and transport header must be set. -func (it *IPTables) CheckPostrouting(pkt *PacketBuffer, r *Route, outNicName string) bool { - return it.check(Postrouting, pkt, r, nil /* addressEP */, "" /* inNicName */, outNicName) +func (it *IPTables) CheckPostrouting(pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, outNicName string) bool { + const hook = Postrouting + + if it.shouldSkip(pkt.NetworkProtocolNumber) { + return true + } + + if t := pkt.tuple; t != nil { + t.conn.handlePacket(pkt, hook, r) + } + + ret := it.check(hook, pkt, r, addressEP, "" /* inNicName */, outNicName) + if t := pkt.tuple; t != nil { + t.conn.finalize() + } + pkt.tuple = nil + return ret +} + +func (it *IPTables) shouldSkip(netProto tcpip.NetworkProtocolNumber) bool { + switch netProto { + case header.IPv4ProtocolNumber, header.IPv6ProtocolNumber: + default: + // IPTables only supports IPv4/IPv6. + return true + } + + it.mu.RLock() + defer it.mu.RUnlock() + // Many users never configure iptables. Spare them the cost of rule + // traversal if rules have never been set. + return !it.modified } // check runs pkt through the rules for hook. It returns true when the packet @@ -320,20 +390,8 @@ func (it *IPTables) CheckPostrouting(pkt *PacketBuffer, r *Route, outNicName str // // Precondition: The packet's network and transport header must be set. func (it *IPTables) check(hook Hook, pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) bool { - if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber { - return true - } - // Many users never configure iptables. Spare them the cost of rule - // traversal if rules have never been set. it.mu.RLock() defer it.mu.RUnlock() - if !it.modified { - return true - } - - // Packets are manipulated only if connection and matching - // NAT rule exists. - shouldTrack := it.connections.handlePacket(pkt, hook, r) // Go through each table containing the hook. priorities := it.priorities[hook] @@ -361,7 +419,7 @@ func (it *IPTables) check(hook Hook, pkt *PacketBuffer, r *Route, addressEP Addr // Any Return from a built-in chain means we have to // call the underflow. underflow := table.Rules[table.Underflows[hook]] - switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, r, addressEP); v { + switch v, _ := underflow.Target.Action(pkt, hook, r, addressEP); v { case RuleAccept: continue case RuleDrop: @@ -377,21 +435,6 @@ func (it *IPTables) check(hook Hook, pkt *PacketBuffer, r *Route, addressEP Addr } } - // If this connection should be tracked, try to add an entry for it. If - // traversing the nat table didn't end in adding an entry, - // maybeInsertNoop will add a no-op entry for the connection. This is - // needeed when establishing connections so that the SYN/ACK reply to an - // outgoing SYN is delivered to the correct endpoint rather than being - // redirected by a prerouting rule. - // - // From the iptables documentation: "If there is no rule, a `null' - // binding is created: this usually does not map the packet, but exists - // to ensure we don't map another stream over an existing one." - if shouldTrack { - it.connections.maybeInsertNoop(pkt, hook) - } - - // Every table returned Accept. return true } @@ -431,7 +474,9 @@ func (it *IPTables) startReaper(interval time.Duration) { // // Precondition: The packets' network and transport header must be set. func (it *IPTables) CheckOutputPackets(pkts PacketBufferList, r *Route, outNicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { - return it.checkPackets(Output, pkts, r, outNicName) + return checkPackets(pkts, func(pkt *PacketBuffer) bool { + return it.CheckOutput(pkt, r, outNicName) + }) } // CheckPostroutingPackets performs the postrouting hook on the packets. @@ -439,21 +484,16 @@ func (it *IPTables) CheckOutputPackets(pkts PacketBufferList, r *Route, outNicNa // Returns a map of packets that must be dropped. // // Precondition: The packets' network and transport header must be set. -func (it *IPTables) CheckPostroutingPackets(pkts PacketBufferList, r *Route, outNicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { - return it.checkPackets(Postrouting, pkts, r, outNicName) +func (it *IPTables) CheckPostroutingPackets(pkts PacketBufferList, r *Route, addressEP AddressableEndpoint, outNicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { + return checkPackets(pkts, func(pkt *PacketBuffer) bool { + return it.CheckPostrouting(pkt, r, addressEP, outNicName) + }) } -// checkPackets runs pkts through the rules for hook and returns a map of -// packets that should not go forward. -// -// NOTE: unlike the Check API the returned map contains packets that should be -// dropped. -// -// Precondition: The packets' network and transport header must be set. -func (it *IPTables) checkPackets(hook Hook, pkts PacketBufferList, r *Route, outNicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { +func checkPackets(pkts PacketBufferList, f func(*PacketBuffer) bool) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { if !pkt.NatDone { - if ok := it.check(hook, pkt, r, nil /* addressEP */, "" /* inNicName */, outNicName); !ok { + if ok := f(pkt); !ok { if drop == nil { drop = make(map[*PacketBuffer]struct{}) } @@ -543,7 +583,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx } // All the matchers matched, so run the target. - return rule.Target.Action(pkt, &it.connections, hook, r, addressEP) + return rule.Target.Action(pkt, hook, r, addressEP) } // OriginalDst returns the original destination of redirected connections. It diff --git a/pkg/tcpip/stack/iptables_state.go b/pkg/tcpip/stack/iptables_state.go index 529e02a07..3d3c39c20 100644 --- a/pkg/tcpip/stack/iptables_state.go +++ b/pkg/tcpip/stack/iptables_state.go @@ -26,11 +26,15 @@ type unixTime struct { // saveLastUsed is invoked by stateify. func (cn *conn) saveLastUsed() unixTime { + cn.mu.Lock() + defer cn.mu.Unlock() return unixTime{cn.lastUsed.Unix(), cn.lastUsed.UnixNano()} } // loadLastUsed is invoked by stateify. func (cn *conn) loadLastUsed(unix unixTime) { + cn.mu.Lock() + defer cn.mu.Unlock() cn.lastUsed = time.Unix(unix.second, unix.nano) } diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go index e8806ebdb..85490e2d4 100644 --- a/pkg/tcpip/stack/iptables_targets.go +++ b/pkg/tcpip/stack/iptables_targets.go @@ -29,7 +29,7 @@ type AcceptTarget struct { } // Action implements Target.Action. -func (*AcceptTarget) Action(*PacketBuffer, *ConnTrack, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { +func (*AcceptTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { return RuleAccept, 0 } @@ -40,7 +40,7 @@ type DropTarget struct { } // Action implements Target.Action. -func (*DropTarget) Action(*PacketBuffer, *ConnTrack, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { +func (*DropTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { return RuleDrop, 0 } @@ -52,7 +52,7 @@ type ErrorTarget struct { } // Action implements Target.Action. -func (*ErrorTarget) Action(*PacketBuffer, *ConnTrack, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { +func (*ErrorTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { log.Debugf("ErrorTarget triggered.") return RuleDrop, 0 } @@ -67,7 +67,7 @@ type UserChainTarget struct { } // Action implements Target.Action. -func (*UserChainTarget) Action(*PacketBuffer, *ConnTrack, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { +func (*UserChainTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { panic("UserChainTarget should never be called.") } @@ -79,10 +79,49 @@ type ReturnTarget struct { } // Action implements Target.Action. -func (*ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { +func (*ReturnTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { return RuleReturn, 0 } +// DNATTarget modifies the destination port/IP of packets. +type DNATTarget struct { + // The new destination address for packets. + // + // Immutable. + Addr tcpip.Address + + // The new destination port for packets. + // + // Immutable. + Port uint16 + + // NetworkProtocol is the network protocol the target is used with. + // + // Immutable. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// Action implements Target.Action. +func (rt *DNATTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, addressEP AddressableEndpoint) (RuleVerdict, int) { + // Sanity check. + if rt.NetworkProtocol != pkt.NetworkProtocolNumber { + panic(fmt.Sprintf( + "DNATTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", + rt.NetworkProtocol, pkt.NetworkProtocolNumber)) + } + + switch hook { + case Prerouting, Output: + case Input, Forward, Postrouting: + panic(fmt.Sprintf("%s not supported for DNAT", hook)) + default: + panic(fmt.Sprintf("%s unrecognized", hook)) + } + + return natAction(pkt, hook, r, rt.Port, rt.Addr, true /* dnat */) + +} + // RedirectTarget redirects the packet to this machine by modifying the // destination port/IP. Outgoing packets are redirected to the loopback device, // and incoming packets are redirected to the incoming interface (rather than @@ -97,7 +136,7 @@ type RedirectTarget struct { } // Action implements Target.Action. -func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r *Route, addressEP AddressableEndpoint) (RuleVerdict, int) { +func (rt *RedirectTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, addressEP AddressableEndpoint) (RuleVerdict, int) { // Sanity check. if rt.NetworkProtocol != pkt.NetworkProtocolNumber { panic(fmt.Sprintf( @@ -105,16 +144,6 @@ func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r rt.NetworkProtocol, pkt.NetworkProtocolNumber)) } - // Packet is already manipulated. - if pkt.NatDone { - return RuleAccept, 0 - } - - // Drop the packet if network and transport header are not set. - if pkt.NetworkHeader().View().IsEmpty() || pkt.TransportHeader().View().IsEmpty() { - return RuleDrop, 0 - } - // Change the address to loopback (127.0.0.1 or ::1) in Output and to // the primary address of the incoming interface in Prerouting. var address tcpip.Address @@ -132,43 +161,7 @@ func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r panic("redirect target is supported only on output and prerouting hooks") } - switch protocol := pkt.TransportProtocolNumber; protocol { - case header.UDPProtocolNumber: - udpHeader := header.UDP(pkt.TransportHeader().View()) - - if hook == Output { - // Only calculate the checksum if offloading isn't supported. - requiresChecksum := r.RequiresTXTransportChecksum() - rewritePacket( - pkt.Network(), - udpHeader, - false, /* updateSRCFields */ - requiresChecksum, - requiresChecksum, - rt.Port, - address, - ) - } else { - udpHeader.SetDestinationPort(rt.Port) - } - - pkt.NatDone = true - case header.TCPProtocolNumber: - if ct == nil { - return RuleAccept, 0 - } - - // Set up conection for matching NAT rule. Only the first - // packet of the connection comes here. Other packets will be - // manipulated in connection tracking. - if conn := ct.insertRedirectConn(pkt, hook, rt.Port, address); conn != nil { - ct.handlePacket(pkt, hook, r) - } - default: - return RuleDrop, 0 - } - - return RuleAccept, 0 + return natAction(pkt, hook, r, rt.Port, address, true /* dnat */) } // SNATTarget modifies the source port/IP in the outgoing packets. @@ -181,15 +174,7 @@ type SNATTarget struct { NetworkProtocol tcpip.NetworkProtocolNumber } -// Action implements Target.Action. -func (st *SNATTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r *Route, _ AddressableEndpoint) (RuleVerdict, int) { - // Sanity check. - if st.NetworkProtocol != pkt.NetworkProtocolNumber { - panic(fmt.Sprintf( - "SNATTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", - st.NetworkProtocol, pkt.NetworkProtocolNumber)) - } - +func natAction(pkt *PacketBuffer, hook Hook, r *Route, port uint16, address tcpip.Address, dnat bool) (RuleVerdict, int) { // Packet is already manipulated. if pkt.NatDone { return RuleAccept, 0 @@ -200,6 +185,37 @@ func (st *SNATTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r *Rou return RuleDrop, 0 } + t := pkt.tuple + if t == nil { + return RuleDrop, 0 + } + + // TODO(https://gvisor.dev/issue/5773): If the port is in use, pick a + // different port. + if port == 0 { + switch protocol := pkt.TransportProtocolNumber; protocol { + case header.UDPProtocolNumber: + port = header.UDP(pkt.TransportHeader().View()).SourcePort() + case header.TCPProtocolNumber: + port = header.TCP(pkt.TransportHeader().View()).SourcePort() + default: + panic(fmt.Sprintf("unsupported transport protocol = %d", pkt.TransportProtocolNumber)) + } + } + + t.conn.performNAT(pkt, hook, r, port, address, dnat) + return RuleAccept, 0 +} + +// Action implements Target.Action. +func (st *SNATTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, _ AddressableEndpoint) (RuleVerdict, int) { + // Sanity check. + if st.NetworkProtocol != pkt.NetworkProtocolNumber { + panic(fmt.Sprintf( + "SNATTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", + st.NetworkProtocol, pkt.NetworkProtocolNumber)) + } + switch hook { case Postrouting, Input: case Prerouting, Output, Forward: @@ -208,31 +224,43 @@ func (st *SNATTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, r *Rou panic(fmt.Sprintf("%s unrecognized", hook)) } - port := st.Port + return natAction(pkt, hook, r, st.Port, st.Addr, false /* dnat */) +} - if port == 0 { - switch protocol := pkt.TransportProtocolNumber; protocol { - case header.UDPProtocolNumber: - if port == 0 { - port = header.UDP(pkt.TransportHeader().View()).SourcePort() - } - case header.TCPProtocolNumber: - if port == 0 { - port = header.TCP(pkt.TransportHeader().View()).SourcePort() - } - } +// MasqueradeTarget modifies the source port/IP in the outgoing packets. +type MasqueradeTarget struct { + // NetworkProtocol is the network protocol the target is used with. It + // is immutable. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// Action implements Target.Action. +func (mt *MasqueradeTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, addressEP AddressableEndpoint) (RuleVerdict, int) { + // Sanity check. + if mt.NetworkProtocol != pkt.NetworkProtocolNumber { + panic(fmt.Sprintf( + "MasqueradeTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", + mt.NetworkProtocol, pkt.NetworkProtocolNumber)) } - // Set up conection for matching NAT rule. Only the first packet of the - // connection comes here. Other packets will be manipulated in connection - // tracking. - // - // Does nothing if the protocol does not support connection tracking. - if conn := ct.insertSNATConn(pkt, hook, port, st.Addr); conn != nil { - ct.handlePacket(pkt, hook, r) + switch hook { + case Postrouting: + case Prerouting, Input, Forward, Output: + panic(fmt.Sprintf("masquerade target is supported only on postrouting hook; hook = %d", hook)) + default: + panic(fmt.Sprintf("%s unrecognized", hook)) } - return RuleAccept, 0 + // addressEP is expected to be set for the postrouting hook. + ep := addressEP.AcquireOutgoingPrimaryAddress(pkt.Network().DestinationAddress(), false /* allowExpired */) + if ep == nil { + // No address exists that we can use as a source address. + return RuleDrop, 0 + } + + address := ep.AddressWithPrefix().Address + ep.DecRef() + return natAction(pkt, hook, r, 0 /* port */, address, false /* dnat */) } func rewritePacket(n header.Network, t header.ChecksummableTransport, updateSRCFields, fullChecksum, updatePseudoHeader bool, newPort uint16, newAddr tcpip.Address) { diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index 976194124..b22024667 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -81,17 +81,6 @@ const ( // // +stateify savable type IPTables struct { - // mu protects v4Tables, v6Tables, and modified. - mu sync.RWMutex - // v4Tables and v6tables map tableIDs to tables. They hold builtin - // tables only, not user tables. mu must be locked for accessing. - v4Tables [NumTables]Table - v6Tables [NumTables]Table - // modified is whether tables have been modified at least once. It is - // used to elide the iptables performance overhead for workloads that - // don't utilize iptables. - modified bool - // priorities maps each hook to a list of table names. The order of the // list is the order in which each table should be visited for that // hook. It is immutable. @@ -101,6 +90,21 @@ type IPTables struct { // reaperDone can be signaled to stop the reaper goroutine. reaperDone chan struct{} + + mu sync.RWMutex + // v4Tables and v6tables map tableIDs to tables. They hold builtin + // tables only, not user tables. + // + // +checklocks:mu + v4Tables [NumTables]Table + // +checklocks:mu + v6Tables [NumTables]Table + // modified is whether tables have been modified at least once. It is + // used to elide the iptables performance overhead for workloads that + // don't utilize iptables. + // + // +checklocks:mu + modified bool } // VisitTargets traverses all the targets of all tables and replaces each with @@ -352,5 +356,5 @@ type Target interface { // Action takes an action on the packet and returns a verdict on how // traversal should (or should not) continue. If the return value is // Jump, it also returns the index of the rule to jump to. - Action(*PacketBuffer, *ConnTrack, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) + Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) } diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index bf248ef20..888a8bd9d 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -143,6 +143,8 @@ type PacketBuffer struct { // NetworkPacketInfo holds an incoming packet's network-layer information. NetworkPacketInfo NetworkPacketInfo + + tuple *tuple } // NewPacketBuffer creates a new PacketBuffer with opts. @@ -302,6 +304,7 @@ func (pk *PacketBuffer) Clone() *PacketBuffer { NICID: pk.NICID, RXTransportChecksumValidated: pk.RXTransportChecksumValidated, NetworkPacketInfo: pk.NetworkPacketInfo, + tuple: pk.tuple, } } @@ -329,13 +332,8 @@ func (pk *PacketBuffer) CloneToInbound() *PacketBuffer { buf: pk.buf.Clone(), // Treat unfilled header portion as reserved. reserved: pk.AvailableHeaderBytes(), + tuple: pk.tuple, } - // TODO(gvisor.dev/issue/5696): reimplement conntrack so that no need to - // maintain this flag in the packet. Currently conntrack needs this flag to - // tell if a noop connection should be inserted at Input hook. Once conntrack - // redefines the manipulation field as mutable, we won't need the special noop - // connection. - newPk.NatDone = pk.NatDone return newPk } @@ -367,12 +365,7 @@ func (pk *PacketBuffer) DeepCopyForForwarding(reservedHeaderBytes int) *PacketBu newPk.TransportProtocolNumber = pk.TransportProtocolNumber } - // TODO(gvisor.dev/issue/5696): reimplement conntrack so that no need to - // maintain this flag in the packet. Currently conntrack needs this flag to - // tell if a noop connection should be inserted at Input hook. Once conntrack - // redefines the manipulation field as mutable, we won't need the special noop - // connection. - newPk.NatDone = pk.NatDone + newPk.tuple = pk.tuple return newPk } @@ -425,13 +418,14 @@ func (d PacketData) PullUp(size int) (tcpipbuffer.View, bool) { return d.pk.buf.PullUp(d.pk.dataOffset(), size) } -// DeleteFront removes count from the beginning of d. It panics if count > -// d.Size(). All backing storage references after the front of the d are -// invalidated. -func (d PacketData) DeleteFront(count int) { - if !d.pk.buf.Remove(d.pk.dataOffset(), count) { - panic("count > d.Size()") +// Consume is the same as PullUp except that is additionally consumes the +// returned bytes. Subsequent PullUp or Consume will not return these bytes. +func (d PacketData) Consume(size int) (tcpipbuffer.View, bool) { + v, ok := d.PullUp(size) + if ok { + d.pk.consumed += size } + return v, ok } // CapLength reduces d to at most length bytes. diff --git a/pkg/tcpip/stack/packet_buffer_test.go b/pkg/tcpip/stack/packet_buffer_test.go index 87b023445..c376ed1a1 100644 --- a/pkg/tcpip/stack/packet_buffer_test.go +++ b/pkg/tcpip/stack/packet_buffer_test.go @@ -123,32 +123,6 @@ func TestPacketHeaderPush(t *testing.T) { } } -func TestPacketBufferClone(t *testing.T) { - data := concatViews(makeView(20), makeView(30), makeView(40)) - pk := NewPacketBuffer(PacketBufferOptions{ - // Make a copy of data to make sure our truth data won't be taint by - // PacketBuffer. - Data: buffer.NewViewFromBytes(data).ToVectorisedView(), - }) - - bytesToDelete := 30 - originalSize := data.Size() - - clonedPks := []*PacketBuffer{ - pk.Clone(), - pk.CloneToInbound(), - } - pk.Data().DeleteFront(bytesToDelete) - if got, want := pk.Data().Size(), originalSize-bytesToDelete; got != want { - t.Errorf("original packet was not changed: size expected = %d, got = %d", want, got) - } - for _, clonedPk := range clonedPks { - if got := clonedPk.Data().Size(); got != originalSize { - t.Errorf("cloned packet should not be modified: expected size = %d, got = %d", originalSize, got) - } - } -} - func TestPacketHeaderConsume(t *testing.T) { for _, test := range []struct { name string @@ -461,11 +435,17 @@ func TestPacketBufferData(t *testing.T) { } }) - // DeleteFront + // Consume. for _, n := range []int{1, len(tc.data)} { - t.Run(fmt.Sprintf("DeleteFront%d", n), func(t *testing.T) { + t.Run(fmt.Sprintf("Consume%d", n), func(t *testing.T) { pkt := tc.makePkt(t) - pkt.Data().DeleteFront(n) + v, ok := pkt.Data().Consume(n) + if !ok { + t.Fatalf("Consume failed") + } + if want := []byte(tc.data)[:n]; !bytes.Equal(v, want) { + t.Fatalf("pkt.Data().Consume(n) = 0x%x, want 0x%x", v, want) + } checkData(t, pkt, []byte(tc.data)[n:]) }) diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index cd4137794..c23e91702 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -139,18 +139,15 @@ func (f *fakeNetworkEndpoint) HandlePacket(pkt *stack.PacketBuffer) { // Handle control packets. if netHdr[protocolNumberOffset] == uint8(fakeControlProtocol) { - hdr, ok := pkt.Data().PullUp(fakeNetHeaderLen) + hdr, ok := pkt.Data().Consume(fakeNetHeaderLen) if !ok { return } - // DeleteFront invalidates slices. Make a copy before trimming. - nb := append([]byte(nil), hdr...) - pkt.Data().DeleteFront(fakeNetHeaderLen) f.dispatcher.DeliverTransportError( - tcpip.Address(nb[srcAddrOffset:srcAddrOffset+1]), - tcpip.Address(nb[dstAddrOffset:dstAddrOffset+1]), + tcpip.Address(hdr[srcAddrOffset:srcAddrOffset+1]), + tcpip.Address(hdr[dstAddrOffset:dstAddrOffset+1]), fakeNetNumber, - tcpip.TransportProtocolNumber(nb[protocolNumberOffset]), + tcpip.TransportProtocolNumber(hdr[protocolNumberOffset]), // Nothing checks the error. nil, /* transport error */ pkt, diff --git a/pkg/tcpip/stack/tcp.go b/pkg/tcpip/stack/tcp.go index dc7289441..a941091b0 100644 --- a/pkg/tcpip/stack/tcp.go +++ b/pkg/tcpip/stack/tcp.go @@ -289,6 +289,12 @@ type TCPSenderState struct { // RACKState holds the state related to RACK loss detection algorithm. RACKState TCPRACKState + + // RetransmitTS records the timestamp used to detect spurious recovery. + RetransmitTS uint32 + + // SpuriousRecovery indicates if the sender entered recovery spuriously. + SpuriousRecovery bool } // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index d45a2c05c..460a6afaf 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -423,9 +423,9 @@ type ControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool - // Timestamp is the time (in ns) that the last packet used to create - // the read data was received. - Timestamp int64 + // Timestamp is the time that the last packet used to create the read data + // was received. + Timestamp time.Time `state:".(int64)"` // HasInq indicates whether Inq is valid/set. HasInq bool @@ -471,10 +471,10 @@ type ControlMessages struct { // PacketOwner is used to get UID and GID of the packet. type PacketOwner interface { - // UID returns KUID of the packet. + // KUID returns KUID of the packet. KUID() uint32 - // GID returns KGID of the packet. + // KGID returns KGID of the packet. KGID() uint32 } @@ -1245,11 +1245,11 @@ type Route struct { // String implements the fmt.Stringer interface. func (r Route) String() string { var out strings.Builder - fmt.Fprintf(&out, "%s", r.Destination) + _, _ = fmt.Fprintf(&out, "%s", r.Destination) if len(r.Gateway) > 0 { - fmt.Fprintf(&out, " via %s", r.Gateway) + _, _ = fmt.Fprintf(&out, " via %s", r.Gateway) } - fmt.Fprintf(&out, " nic %d", r.NIC) + _, _ = fmt.Fprintf(&out, " nic %d", r.NIC) return out.String() } @@ -1286,7 +1286,7 @@ func (s *StatCounter) Decrement() { } // Value returns the current value of the counter. -func (s *StatCounter) Value(name ...string) uint64 { +func (s *StatCounter) Value(...string) uint64 { return s.count.Load() } @@ -1865,6 +1865,10 @@ type TCPStats struct { // SegmentsAckedWithDSACK is the number of segments acknowledged with // DSACK. SegmentsAckedWithDSACK *StatCounter + + // SpuriousRecovery is the number of times the connection entered loss + // recovery spuriously. + SpuriousRecovery *StatCounter } // UDPStats collects UDP-specific stats. diff --git a/pkg/tcpip/tcpip_state.go b/pkg/tcpip/tcpip_state.go new file mode 100644 index 000000000..1953e24a1 --- /dev/null +++ b/pkg/tcpip/tcpip_state.go @@ -0,0 +1,27 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcpip + +import ( + "time" +) + +func (c *ControlMessages) saveTimestamp() int64 { + return c.Timestamp.UnixNano() +} + +func (c *ControlMessages) loadTimestamp(nsec int64) { + c.Timestamp = time.Unix(0, nsec) +} diff --git a/pkg/tcpip/tests/integration/iptables_test.go b/pkg/tcpip/tests/integration/iptables_test.go index bdf4a64b9..7f872c271 100644 --- a/pkg/tcpip/tests/integration/iptables_test.go +++ b/pkg/tcpip/tests/integration/iptables_test.go @@ -1162,19 +1162,19 @@ func TestInputHookWithLocalForwarding(t *testing.T) { } } -func TestSNAT(t *testing.T) { - const listenPort = 8080 +func TestNAT(t *testing.T) { + const listenPort uint16 = 8080 type endpointAndAddresses struct { - serverEP tcpip.Endpoint - serverAddr tcpip.Address - serverReadableCH chan struct{} - - clientEP tcpip.Endpoint - clientAddr tcpip.Address - clientReadableCH chan struct{} - - nattedClientAddr tcpip.Address + serverEP tcpip.Endpoint + serverAddr tcpip.FullAddress + serverReadableCH chan struct{} + serverConnectAddr tcpip.Address + + clientEP tcpip.Endpoint + clientAddr tcpip.Address + clientReadableCH chan struct{} + clientConnectAddr tcpip.FullAddress } newEP := func(t *testing.T, s *stack.Stack, transProto tcpip.TransportProtocolNumber, netProto tcpip.NetworkProtocolNumber) (tcpip.Endpoint, chan struct{}) { @@ -1195,71 +1195,247 @@ func TestSNAT(t *testing.T) { return ep, ch } + setupNAT := func(t *testing.T, s *stack.Stack, netProto tcpip.NetworkProtocolNumber, hook stack.Hook, filter stack.IPHeaderFilter, target stack.Target) { + t.Helper() + + ipv6 := netProto == ipv6.ProtocolNumber + ipt := s.IPTables() + table := ipt.GetTable(stack.NATID, ipv6) + ruleIdx := table.BuiltinChains[hook] + table.Rules[ruleIdx].Filter = filter + table.Rules[ruleIdx].Target = target + // Make sure the packet is not dropped by the next rule. + table.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.NATID, table, ipv6); err != nil { + t.Fatalf("ipt.ReplaceTable(%d, _, %t): %s", stack.NATID, ipv6, err) + } + } + + setupDNAT := func(t *testing.T, s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, target stack.Target) { + t.Helper() + + setupNAT( + t, + s, + netProto, + stack.Prerouting, + stack.IPHeaderFilter{ + Protocol: transProto, + CheckProtocol: true, + InputInterface: utils.RouterNIC2Name, + }, + target) + } + + setupSNAT := func(t *testing.T, s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, target stack.Target) { + t.Helper() + + setupNAT( + t, + s, + netProto, + stack.Postrouting, + stack.IPHeaderFilter{ + Protocol: transProto, + CheckProtocol: true, + OutputInterface: utils.RouterNIC1Name, + }, + target) + } + + type natType struct { + name string + setupNAT func(_ *testing.T, _ *stack.Stack, _ tcpip.NetworkProtocolNumber, _ tcpip.TransportProtocolNumber, snatAddr, dnatAddr tcpip.Address) + } + + snatTypes := []natType{ + { + name: "SNAT", + setupNAT: func(t *testing.T, s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, snatAddr, _ tcpip.Address) { + t.Helper() + + setupSNAT(t, s, netProto, transProto, &stack.SNATTarget{NetworkProtocol: netProto, Addr: snatAddr}) + }, + }, + { + name: "Masquerade", + setupNAT: func(t *testing.T, s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, _, _ tcpip.Address) { + t.Helper() + + setupSNAT(t, s, netProto, transProto, &stack.MasqueradeTarget{NetworkProtocol: netProto}) + }, + }, + } + dnatTypes := []natType{ + { + name: "Redirect", + setupNAT: func(t *testing.T, s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, _, _ tcpip.Address) { + t.Helper() + + setupDNAT(t, s, netProto, transProto, &stack.RedirectTarget{NetworkProtocol: netProto, Port: listenPort}) + }, + }, + { + name: "DNAT", + setupNAT: func(t *testing.T, s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, _, dnatAddr tcpip.Address) { + t.Helper() + + setupDNAT(t, s, netProto, transProto, &stack.DNATTarget{NetworkProtocol: netProto, Addr: dnatAddr, Port: listenPort}) + }, + }, + } + tests := []struct { - name string + name string + netProto tcpip.NetworkProtocolNumber + // Setups up the stacks in such a way that: + // + // - Host2 is the client for all tests. + // - Host1 is the server when performing SNAT + // + NAT will transform client-originating packets' source addresses to + // the router's NIC1's address before reaching Host1. + // - Router is the server when performing DNAT (client will still attempt to + // send packets to Host1). + // + NAT will transform client-originating packets' destination addresses + // to the router's NIC2's address. epAndAddrs func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack, proto tcpip.TransportProtocolNumber) endpointAndAddresses + natTypes []natType }{ { - name: "IPv4 host1 server with host2 client", + name: "IPv4 SNAT", + netProto: ipv4.ProtocolNumber, epAndAddrs: func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack, proto tcpip.TransportProtocolNumber) endpointAndAddresses { t.Helper() - ipt := routerStack.IPTables() - filter := ipt.GetTable(stack.NATID, false /* ipv6 */) - ruleIdx := filter.BuiltinChains[stack.Postrouting] - filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{OutputInterface: utils.RouterNIC1Name} - filter.Rules[ruleIdx].Target = &stack.SNATTarget{NetworkProtocol: ipv4.ProtocolNumber, Addr: utils.RouterNIC1IPv4Addr.AddressWithPrefix.Address} - // Make sure the packet is not dropped by the next rule. - filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} - if err := ipt.ReplaceTable(stack.NATID, filter, false /* ipv6 */); err != nil { - t.Fatalf("ipt.ReplaceTable(%d, _, %t): %s", stack.NATID, false, err) + listenerStack := host1Stack + serverAddr := tcpip.FullAddress{ + Addr: utils.Host1IPv4Addr.AddressWithPrefix.Address, + Port: listenPort, } - - ep1, ep1WECH := newEP(t, host1Stack, proto, ipv4.ProtocolNumber) + serverConnectAddr := utils.RouterNIC1IPv4Addr.AddressWithPrefix.Address + clientConnectPort := serverAddr.Port + ep1, ep1WECH := newEP(t, listenerStack, proto, ipv4.ProtocolNumber) ep2, ep2WECH := newEP(t, host2Stack, proto, ipv4.ProtocolNumber) return endpointAndAddresses{ - serverEP: ep1, - serverAddr: utils.Host1IPv4Addr.AddressWithPrefix.Address, - serverReadableCH: ep1WECH, + serverEP: ep1, + serverAddr: serverAddr, + serverReadableCH: ep1WECH, + serverConnectAddr: serverConnectAddr, clientEP: ep2, clientAddr: utils.Host2IPv4Addr.AddressWithPrefix.Address, clientReadableCH: ep2WECH, - - nattedClientAddr: utils.RouterNIC1IPv4Addr.AddressWithPrefix.Address, + clientConnectAddr: tcpip.FullAddress{ + Addr: utils.Host1IPv4Addr.AddressWithPrefix.Address, + Port: clientConnectPort, + }, } }, + natTypes: snatTypes, }, { - name: "IPv6 host1 server with host2 client", + name: "IPv4 DNAT", + netProto: ipv4.ProtocolNumber, epAndAddrs: func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack, proto tcpip.TransportProtocolNumber) endpointAndAddresses { t.Helper() - ipt := routerStack.IPTables() - filter := ipt.GetTable(stack.NATID, true /* ipv6 */) - ruleIdx := filter.BuiltinChains[stack.Postrouting] - filter.Rules[ruleIdx].Filter = stack.IPHeaderFilter{OutputInterface: utils.RouterNIC1Name} - filter.Rules[ruleIdx].Target = &stack.SNATTarget{NetworkProtocol: ipv6.ProtocolNumber, Addr: utils.RouterNIC1IPv6Addr.AddressWithPrefix.Address} - // Make sure the packet is not dropped by the next rule. - filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{} - if err := ipt.ReplaceTable(stack.NATID, filter, true /* ipv6 */); err != nil { - t.Fatalf("ipt.ReplaceTable(%d, _, %t): %s", stack.NATID, true, err) + // If we are performing DNAT, then the packet will be redirected + // to the router. + listenerStack := routerStack + serverAddr := tcpip.FullAddress{ + Addr: utils.RouterNIC2IPv4Addr.AddressWithPrefix.Address, + Port: listenPort, } + serverConnectAddr := utils.Host2IPv4Addr.AddressWithPrefix.Address + // DNAT will update the destination port to what the server is + // bound to. + clientConnectPort := serverAddr.Port + 1 + ep1, ep1WECH := newEP(t, listenerStack, proto, ipv4.ProtocolNumber) + ep2, ep2WECH := newEP(t, host2Stack, proto, ipv4.ProtocolNumber) + return endpointAndAddresses{ + serverEP: ep1, + serverAddr: serverAddr, + serverReadableCH: ep1WECH, + serverConnectAddr: serverConnectAddr, - ep1, ep1WECH := newEP(t, host1Stack, proto, ipv6.ProtocolNumber) + clientEP: ep2, + clientAddr: utils.Host2IPv4Addr.AddressWithPrefix.Address, + clientReadableCH: ep2WECH, + clientConnectAddr: tcpip.FullAddress{ + Addr: utils.Host1IPv4Addr.AddressWithPrefix.Address, + Port: clientConnectPort, + }, + } + }, + natTypes: dnatTypes, + }, + { + name: "IPv6 SNAT", + netProto: ipv6.ProtocolNumber, + epAndAddrs: func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack, proto tcpip.TransportProtocolNumber) endpointAndAddresses { + t.Helper() + + listenerStack := host1Stack + serverAddr := tcpip.FullAddress{ + Addr: utils.Host1IPv6Addr.AddressWithPrefix.Address, + Port: listenPort, + } + serverConnectAddr := utils.RouterNIC1IPv6Addr.AddressWithPrefix.Address + clientConnectPort := serverAddr.Port + ep1, ep1WECH := newEP(t, listenerStack, proto, ipv6.ProtocolNumber) ep2, ep2WECH := newEP(t, host2Stack, proto, ipv6.ProtocolNumber) return endpointAndAddresses{ - serverEP: ep1, - serverAddr: utils.Host1IPv6Addr.AddressWithPrefix.Address, - serverReadableCH: ep1WECH, + serverEP: ep1, + serverAddr: serverAddr, + serverReadableCH: ep1WECH, + serverConnectAddr: serverConnectAddr, clientEP: ep2, clientAddr: utils.Host2IPv6Addr.AddressWithPrefix.Address, clientReadableCH: ep2WECH, + clientConnectAddr: tcpip.FullAddress{ + Addr: utils.Host1IPv6Addr.AddressWithPrefix.Address, + Port: clientConnectPort, + }, + } + }, + natTypes: snatTypes, + }, + { + name: "IPv6 DNAT", + netProto: ipv6.ProtocolNumber, + epAndAddrs: func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack, proto tcpip.TransportProtocolNumber) endpointAndAddresses { + t.Helper() + + // If we are performing DNAT, then the packet will be redirected + // to the router. + listenerStack := routerStack + serverAddr := tcpip.FullAddress{ + Addr: utils.RouterNIC2IPv6Addr.AddressWithPrefix.Address, + Port: listenPort, + } + serverConnectAddr := utils.Host2IPv6Addr.AddressWithPrefix.Address + // DNAT will update the destination port to what the server is + // bound to. + clientConnectPort := serverAddr.Port + 1 + ep1, ep1WECH := newEP(t, listenerStack, proto, ipv6.ProtocolNumber) + ep2, ep2WECH := newEP(t, host2Stack, proto, ipv6.ProtocolNumber) + return endpointAndAddresses{ + serverEP: ep1, + serverAddr: serverAddr, + serverReadableCH: ep1WECH, + serverConnectAddr: serverConnectAddr, - nattedClientAddr: utils.RouterNIC1IPv6Addr.AddressWithPrefix.Address, + clientEP: ep2, + clientAddr: utils.Host2IPv6Addr.AddressWithPrefix.Address, + clientReadableCH: ep2WECH, + clientConnectAddr: tcpip.FullAddress{ + Addr: utils.Host1IPv6Addr.AddressWithPrefix.Address, + Port: clientConnectPort, + }, } }, + natTypes: dnatTypes, }, } @@ -1328,116 +1504,121 @@ func TestSNAT(t *testing.T) { t.Run(test.name, func(t *testing.T) { for _, subTest := range subTests { t.Run(subTest.name, func(t *testing.T) { - stackOpts := stack.Options{ - NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol}, - TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, - } + for _, natType := range test.natTypes { + t.Run(natType.name, func(t *testing.T) { + stackOpts := stack.Options{ + NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol}, + } - host1Stack := stack.New(stackOpts) - routerStack := stack.New(stackOpts) - host2Stack := stack.New(stackOpts) - utils.SetupRoutedStacks(t, host1Stack, routerStack, host2Stack) + host1Stack := stack.New(stackOpts) + routerStack := stack.New(stackOpts) + host2Stack := stack.New(stackOpts) + utils.SetupRoutedStacks(t, host1Stack, routerStack, host2Stack) - epsAndAddrs := test.epAndAddrs(t, host1Stack, routerStack, host2Stack, subTest.proto) - serverAddr := tcpip.FullAddress{Addr: epsAndAddrs.serverAddr, Port: listenPort} - if err := epsAndAddrs.serverEP.Bind(serverAddr); err != nil { - t.Fatalf("epsAndAddrs.serverEP.Bind(%#v): %s", serverAddr, err) - } - clientAddr := tcpip.FullAddress{Addr: epsAndAddrs.clientAddr} - if err := epsAndAddrs.clientEP.Bind(clientAddr); err != nil { - t.Fatalf("epsAndAddrs.clientEP.Bind(%#v): %s", clientAddr, err) - } + epsAndAddrs := test.epAndAddrs(t, host1Stack, routerStack, host2Stack, subTest.proto) + natType.setupNAT(t, routerStack, test.netProto, subTest.proto, epsAndAddrs.serverConnectAddr, epsAndAddrs.serverAddr.Addr) - if subTest.setupServer != nil { - subTest.setupServer(t, epsAndAddrs.serverEP) - } - { - err := epsAndAddrs.clientEP.Connect(serverAddr) - if diff := cmp.Diff(subTest.expectedConnectErr, err); diff != "" { - t.Fatalf("unexpected error from epsAndAddrs.clientEP.Connect(%#v), (-want, +got):\n%s", serverAddr, diff) - } - } - nattedClientAddr := tcpip.FullAddress{Addr: epsAndAddrs.nattedClientAddr} - if addr, err := epsAndAddrs.clientEP.GetLocalAddress(); err != nil { - t.Fatalf("epsAndAddrs.clientEP.GetLocalAddress(): %s", err) - } else { - nattedClientAddr.Port = addr.Port - } + if err := epsAndAddrs.serverEP.Bind(epsAndAddrs.serverAddr); err != nil { + t.Fatalf("epsAndAddrs.serverEP.Bind(%#v): %s", epsAndAddrs.serverAddr, err) + } + clientAddr := tcpip.FullAddress{Addr: epsAndAddrs.clientAddr} + if err := epsAndAddrs.clientEP.Bind(clientAddr); err != nil { + t.Fatalf("epsAndAddrs.clientEP.Bind(%#v): %s", clientAddr, err) + } - serverEP := epsAndAddrs.serverEP - serverCH := epsAndAddrs.serverReadableCH - if ep, ch := subTest.setupServerConn(t, serverEP, serverCH, nattedClientAddr); ep != nil { - defer ep.Close() - serverEP = ep - serverCH = ch - } + if subTest.setupServer != nil { + subTest.setupServer(t, epsAndAddrs.serverEP) + } + { + err := epsAndAddrs.clientEP.Connect(epsAndAddrs.clientConnectAddr) + if diff := cmp.Diff(subTest.expectedConnectErr, err); diff != "" { + t.Fatalf("unexpected error from epsAndAddrs.clientEP.Connect(%#v), (-want, +got):\n%s", epsAndAddrs.clientConnectAddr, diff) + } + } + serverConnectAddr := tcpip.FullAddress{Addr: epsAndAddrs.serverConnectAddr} + if addr, err := epsAndAddrs.clientEP.GetLocalAddress(); err != nil { + t.Fatalf("epsAndAddrs.clientEP.GetLocalAddress(): %s", err) + } else { + serverConnectAddr.Port = addr.Port + } - write := func(ep tcpip.Endpoint, data []byte) { - t.Helper() - - var r bytes.Reader - r.Reset(data) - var wOpts tcpip.WriteOptions - n, err := ep.Write(&r, wOpts) - if err != nil { - t.Fatalf("ep.Write(_, %#v): %s", wOpts, err) - } - if want := int64(len(data)); n != want { - t.Fatalf("got ep.Write(_, %#v) = (%d, _), want = (%d, _)", wOpts, n, want) - } - } + serverEP := epsAndAddrs.serverEP + serverCH := epsAndAddrs.serverReadableCH + if ep, ch := subTest.setupServerConn(t, serverEP, serverCH, serverConnectAddr); ep != nil { + defer ep.Close() + serverEP = ep + serverCH = ch + } - read := func(ch chan struct{}, ep tcpip.Endpoint, data []byte, expectedFrom tcpip.FullAddress) { - t.Helper() - - var buf bytes.Buffer - var res tcpip.ReadResult - for { - var err tcpip.Error - opts := tcpip.ReadOptions{NeedRemoteAddr: subTest.needRemoteAddr} - res, err = ep.Read(&buf, opts) - if _, ok := err.(*tcpip.ErrWouldBlock); ok { - <-ch - continue + write := func(ep tcpip.Endpoint, data []byte) { + t.Helper() + + var r bytes.Reader + r.Reset(data) + var wOpts tcpip.WriteOptions + n, err := ep.Write(&r, wOpts) + if err != nil { + t.Fatalf("ep.Write(_, %#v): %s", wOpts, err) + } + if want := int64(len(data)); n != want { + t.Fatalf("got ep.Write(_, %#v) = (%d, _), want = (%d, _)", wOpts, n, want) + } } - if err != nil { - t.Fatalf("ep.Read(_, %d, %#v): %s", len(data), opts, err) + + read := func(ch chan struct{}, ep tcpip.Endpoint, data []byte, expectedFrom tcpip.FullAddress) { + t.Helper() + + var buf bytes.Buffer + var res tcpip.ReadResult + for { + var err tcpip.Error + opts := tcpip.ReadOptions{NeedRemoteAddr: subTest.needRemoteAddr} + res, err = ep.Read(&buf, opts) + if _, ok := err.(*tcpip.ErrWouldBlock); ok { + <-ch + continue + } + if err != nil { + t.Fatalf("ep.Read(_, %d, %#v): %s", len(data), opts, err) + } + break + } + + readResult := tcpip.ReadResult{ + Count: len(data), + Total: len(data), + } + if subTest.needRemoteAddr { + readResult.RemoteAddr = expectedFrom + } + if diff := cmp.Diff(readResult, res, checker.IgnoreCmpPath( + "ControlMessages", + "RemoteAddr.NIC", + )); diff != "" { + t.Errorf("ep.Read: unexpected result (-want +got):\n%s", diff) + } + if diff := cmp.Diff(buf.Bytes(), data); diff != "" { + t.Errorf("received data mismatch (-want +got):\n%s", diff) + } + + if t.Failed() { + t.FailNow() + } } - break - } - - readResult := tcpip.ReadResult{ - Count: len(data), - Total: len(data), - } - if subTest.needRemoteAddr { - readResult.RemoteAddr = expectedFrom - } - if diff := cmp.Diff(readResult, res, checker.IgnoreCmpPath( - "ControlMessages", - "RemoteAddr.NIC", - )); diff != "" { - t.Errorf("ep.Read: unexpected result (-want +got):\n%s", diff) - } - if diff := cmp.Diff(buf.Bytes(), data); diff != "" { - t.Errorf("received data mismatch (-want +got):\n%s", diff) - } - - if t.Failed() { - t.FailNow() - } - } - { - data := []byte{1, 2, 3, 4} - write(epsAndAddrs.clientEP, data) - read(serverCH, serverEP, data, nattedClientAddr) - } + { + data := []byte{1, 2, 3, 4} + write(epsAndAddrs.clientEP, data) + read(serverCH, serverEP, data, serverConnectAddr) + } - { - data := []byte{5, 6, 7, 8, 9, 10, 11, 12} - write(serverEP, data) - read(epsAndAddrs.clientReadableCH, epsAndAddrs.clientEP, data, serverAddr) + { + data := []byte{5, 6, 7, 8, 9, 10, 11, 12} + write(serverEP, data) + read(epsAndAddrs.clientReadableCH, epsAndAddrs.clientEP, data, epsAndAddrs.clientConnectAddr) + } + }) } }) } diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index bb0db9f70..31579a896 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -180,7 +180,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult Total: p.data.Size(), ControlMessages: tcpip.ControlMessages{ HasTimestamp: true, - Timestamp: p.receivedAt.UnixNano(), + Timestamp: p.receivedAt, }, } if opts.NeedRemoteAddr { diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 689427d53..80eef39e9 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -182,7 +182,7 @@ func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResul Total: packet.data.Size(), ControlMessages: tcpip.ControlMessages{ HasTimestamp: true, - Timestamp: packet.receivedAt.UnixNano(), + Timestamp: packet.receivedAt, }, } if opts.NeedRemoteAddr { @@ -409,7 +409,7 @@ func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { } // HandlePacket implements stack.PacketEndpoint.HandlePacket. -func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +func (ep *endpoint) HandlePacket(nicID tcpip.NICID, _ tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { ep.rcvMu.Lock() // Drop the packet if our buffer is currently full. diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index bfef75da7..ce76774af 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -49,6 +49,7 @@ type rawPacket struct { receivedAt time.Time `state:".(int64)"` // senderAddr is the network address of the sender. senderAddr tcpip.FullAddress + packetInfo tcpip.IPPacketInfo } // endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to @@ -202,12 +203,29 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult Total: pkt.data.Size(), ControlMessages: tcpip.ControlMessages{ HasTimestamp: true, - Timestamp: pkt.receivedAt.UnixNano(), + Timestamp: pkt.receivedAt, }, } if opts.NeedRemoteAddr { res.RemoteAddr = pkt.senderAddr } + switch netProto := e.net.NetProto(); netProto { + case header.IPv4ProtocolNumber: + if e.ops.GetReceivePacketInfo() { + res.ControlMessages.HasIPPacketInfo = true + res.ControlMessages.PacketInfo = pkt.packetInfo + } + case header.IPv6ProtocolNumber: + if e.ops.GetIPv6ReceivePacketInfo() { + res.ControlMessages.HasIPv6PacketInfo = true + res.ControlMessages.IPv6PacketInfo = tcpip.IPv6PacketInfo{ + NIC: pkt.packetInfo.NIC, + Addr: pkt.packetInfo.DestinationAddr, + } + } + default: + panic(fmt.Sprintf("unrecognized network protocol = %d", netProto)) + } n, err := pkt.data.ReadTo(dst, opts.Peek) if n == 0 && err != nil { @@ -435,7 +453,9 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { return false } - srcAddr := pkt.Network().SourceAddress() + net := pkt.Network() + dstAddr := net.DestinationAddress() + srcAddr := net.SourceAddress() info := e.net.Info() switch state := e.net.State(); state { @@ -457,7 +477,7 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { } // If bound to an address, only accept data for that address. - if info.BindAddr != "" && info.BindAddr != pkt.Network().DestinationAddress() { + if info.BindAddr != "" && info.BindAddr != dstAddr { return false } default: @@ -472,6 +492,14 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { NIC: pkt.NICID, Addr: srcAddr, }, + packetInfo: tcpip.IPPacketInfo{ + // TODO(gvisor.dev/issue/3556): dstAddr may be a multicast or broadcast + // address. LocalAddr should hold a unicast address that can be + // used to respond to the incoming packet. + LocalAddr: dstAddr, + DestinationAddr: dstAddr, + NIC: pkt.NICID, + }, } // Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not. @@ -483,10 +511,10 @@ func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { // overlapping slices. var combinedVV buffer.VectorisedView if info.NetProto == header.IPv4ProtocolNumber { - network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() - headers := make(buffer.View, 0, len(network)+len(transport)) - headers = append(headers, network...) - headers = append(headers, transport...) + networkHeader, transportHeader := pkt.NetworkHeader().View(), pkt.TransportHeader().View() + headers := make(buffer.View, 0, len(networkHeader)+len(transportHeader)) + headers = append(headers, networkHeader...) + headers = append(headers, transportHeader...) combinedVV = headers.ToVectorisedView() } else { combinedVV = append(buffer.View(nil), pkt.TransportHeader().View()...).ToVectorisedView() diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 7115d0a12..caf14b0dc 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -15,12 +15,12 @@ package tcp import ( + "container/list" "crypto/sha1" "encoding/binary" "fmt" "hash" "io" - "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" @@ -100,18 +100,6 @@ type listenContext struct { // netProto indicates the network protocol(IPv4/v6) for the listening // endpoint. netProto tcpip.NetworkProtocolNumber - - // pendingMu protects pendingEndpoints. This should only be accessed - // by the listening endpoint's worker goroutine. - // - // Lock Ordering: listenEP.workerMu -> pendingMu - pendingMu sync.Mutex - // pending is used to wait for all pendingEndpoints to finish when - // a socket is closed. - pending sync.WaitGroup - // pendingEndpoints is a map of all endpoints for which a handshake is - // in progress. - pendingEndpoints map[stack.TransportEndpointID]*endpoint } // timeStamp returns an 8-bit timestamp with a granularity of 64 seconds. @@ -122,14 +110,13 @@ func timeStamp(clock tcpip.Clock) uint32 { // newListenContext creates a new listen context. func newListenContext(stk *stack.Stack, protocol *protocol, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { l := &listenContext{ - stack: stk, - protocol: protocol, - rcvWnd: rcvWnd, - hasher: sha1.New(), - v6Only: v6Only, - netProto: netProto, - listenEP: listenEP, - pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint), + stack: stk, + protocol: protocol, + rcvWnd: rcvWnd, + hasher: sha1.New(), + v6Only: v6Only, + netProto: netProto, + listenEP: listenEP, } for i := range l.nonce { @@ -265,7 +252,6 @@ func (l *listenContext) startHandshake(s *segment, opts header.TCPSynOptions, qu return nil, &tcpip.ErrConnectionAborted{} } - l.addPendingEndpoint(ep) // Propagate any inheritable options from the listening endpoint // to the newly created endpoint. @@ -275,8 +261,6 @@ func (l *listenContext) startHandshake(s *segment, opts header.TCPSynOptions, qu ep.mu.Unlock() ep.Close() - l.removePendingEndpoint(ep) - return nil, &tcpip.ErrConnectionAborted{} } @@ -295,10 +279,6 @@ func (l *listenContext) startHandshake(s *segment, opts header.TCPSynOptions, qu ep.mu.Unlock() ep.Close() - if l.listenEP != nil { - l.removePendingEndpoint(ep) - } - ep.drainClosingSegmentQueue() return nil, err @@ -336,38 +316,12 @@ func (l *listenContext) performHandshake(s *segment, opts header.TCPSynOptions, return ep, nil } -func (l *listenContext) addPendingEndpoint(n *endpoint) { - l.pendingMu.Lock() - l.pendingEndpoints[n.TransportEndpointInfo.ID] = n - l.pending.Add(1) - l.pendingMu.Unlock() -} - -func (l *listenContext) removePendingEndpoint(n *endpoint) { - l.pendingMu.Lock() - delete(l.pendingEndpoints, n.TransportEndpointInfo.ID) - l.pending.Done() - l.pendingMu.Unlock() -} - -func (l *listenContext) closeAllPendingEndpoints() { - l.pendingMu.Lock() - for _, n := range l.pendingEndpoints { - n.notifyProtocolGoroutine(notifyClose) - } - l.pendingMu.Unlock() - l.pending.Wait() -} - // +checklocks:h.ep.mu func (l *listenContext) cleanupFailedHandshake(h *handshake) { e := h.ep e.mu.Unlock() e.Close() e.notifyAborted() - if l.listenEP != nil { - l.removePendingEndpoint(e) - } e.drainClosingSegmentQueue() e.h = nil } @@ -378,9 +332,6 @@ func (l *listenContext) cleanupFailedHandshake(h *handshake) { // +checklocks:h.ep.mu func (l *listenContext) cleanupCompletedHandshake(h *handshake) { e := h.ep - if l.listenEP != nil { - l.removePendingEndpoint(e) - } e.isConnectNotified = true // Update the receive window scaling. We can't do it before the @@ -444,101 +395,30 @@ func (e *endpoint) notifyAborted() { e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } -// handleSynSegment is called in its own goroutine once the listening endpoint -// receives a SYN segment. It is responsible for completing the handshake and -// queueing the new endpoint for acceptance. -// -// A limited number of these goroutines are allowed before TCP starts using SYN -// cookies to accept connections. -// -// +checklocks:e.mu -func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts header.TCPSynOptions) tcpip.Error { - defer s.decRef() - - h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner) - if err != nil { - e.stack.Stats().TCP.FailedConnectionAttempts.Increment() - e.stats.FailedConnectionAttempts.Increment() - atomic.AddInt32(&e.synRcvdCount, -1) - return err - } - - go func() { - // Note that startHandshake returns a locked endpoint. The - // force call here just makes it so. - if err := h.complete(); err != nil { // +checklocksforce - e.stack.Stats().TCP.FailedConnectionAttempts.Increment() - e.stats.FailedConnectionAttempts.Increment() - ctx.cleanupFailedHandshake(h) - atomic.AddInt32(&e.synRcvdCount, -1) - return - } - ctx.cleanupCompletedHandshake(h) - h.ep.startAcceptedLoop() - e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() - - // Deliver the endpoint to the accept queue. - e.mu.Lock() - e.pendingAccepted.Add(1) - e.mu.Unlock() - defer e.pendingAccepted.Done() - - // Drop the lock before notifying to avoid deadlock in user-specified - // callbacks. - delivered := func() bool { - e.acceptMu.Lock() - defer e.acceptMu.Unlock() - for { - if e.accepted == (accepted{}) { - // If the listener has transitioned out of the listen state (accepted - // is the zero value), the new endpoint is reset instead. - return false - } - if e.accepted.acceptQueueIsFullLocked() { - e.acceptCond.Wait() - continue - } - - e.accepted.endpoints.PushBack(h.ep) - atomic.AddInt32(&e.synRcvdCount, -1) - return true - } - }() - - if delivered { - e.waiterQueue.Notify(waiter.ReadableEvents) - } else { - h.ep.notifyProtocolGoroutine(notifyReset) - } - }() - - return nil -} - -func (e *endpoint) synRcvdBacklogFull() bool { - e.acceptMu.Lock() - acceptedCap := e.accepted.cap - e.acceptMu.Unlock() - // The capacity of the accepted queue would always be one greater than the - // listen backlog. But, the SYNRCVD connections count is always checked - // against the listen backlog value for Linux parity reason. - // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280 - // - // We maintain an equality check here as the synRcvdCount is incremented - // and compared only from a single listener context and the capacity of - // the accepted queue can only increase by a new listen call. - return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedCap-1 -} - func (e *endpoint) acceptQueueIsFull() bool { e.acceptMu.Lock() - full := e.accepted.acceptQueueIsFullLocked() + full := e.acceptQueue.isFull() e.acceptMu.Unlock() return full } -func (a *accepted) acceptQueueIsFullLocked() bool { - return a.endpoints.Len() == a.cap +// +stateify savable +type acceptQueue struct { + // NB: this could be an endpointList, but ilist only permits endpoints to + // belong to one list at a time, and endpoints are already stored in the + // dispatcher's list. + endpoints list.List `state:".([]*endpoint)"` + + // pendingEndpoints is a set of all endpoints for which a handshake is + // in progress. + pendingEndpoints map[*endpoint]struct{} + + // capacity is the maximum number of endpoints that can be in endpoints. + capacity int +} + +func (a *acceptQueue) isFull() bool { + return a.endpoints.Len() == a.capacity } // handleListenSegment is called when a listening endpoint receives a segment @@ -571,20 +451,96 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err return nil } - alwaysUseSynCookies := func() bool { + opts := parseSynSegmentOptions(s) + + useSynCookies, err := func() (bool, tcpip.Error) { var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil { panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err)) } - return bool(alwaysUseSynCookies) - }() + if alwaysUseSynCookies { + return true, nil + } + e.acceptMu.Lock() + defer e.acceptMu.Unlock() - opts := parseSynSegmentOptions(s) - if !alwaysUseSynCookies && !e.synRcvdBacklogFull() { - s.incRef() - atomic.AddInt32(&e.synRcvdCount, 1) - return e.handleSynSegment(ctx, s, opts) + // The capacity of the accepted queue would always be one greater than the + // listen backlog. But, the SYNRCVD connections count is always checked + // against the listen backlog value for Linux parity reason. + // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280 + if len(e.acceptQueue.pendingEndpoints) == e.acceptQueue.capacity-1 { + return true, nil + } + + h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner) + if err != nil { + e.stack.Stats().TCP.FailedConnectionAttempts.Increment() + e.stats.FailedConnectionAttempts.Increment() + return false, err + } + + e.acceptQueue.pendingEndpoints[h.ep] = struct{}{} + e.pendingAccepted.Add(1) + + go func() { + defer func() { + e.pendingAccepted.Done() + + e.acceptMu.Lock() + defer e.acceptMu.Unlock() + delete(e.acceptQueue.pendingEndpoints, h.ep) + }() + + // Note that startHandshake returns a locked endpoint. The force call + // here just makes it so. + if err := h.complete(); err != nil { // +checklocksforce + e.stack.Stats().TCP.FailedConnectionAttempts.Increment() + e.stats.FailedConnectionAttempts.Increment() + ctx.cleanupFailedHandshake(h) + return + } + ctx.cleanupCompletedHandshake(h) + h.ep.startAcceptedLoop() + e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() + + // Deliver the endpoint to the accept queue. + // + // Drop the lock before notifying to avoid deadlock in user-specified + // callbacks. + delivered := func() bool { + e.acceptMu.Lock() + defer e.acceptMu.Unlock() + for { + // The listener is transitioning out of the Listen state; bail. + if e.acceptQueue.capacity == 0 { + return false + } + if e.acceptQueue.isFull() { + e.acceptCond.Wait() + continue + } + + e.acceptQueue.endpoints.PushBack(h.ep) + return true + } + }() + + if delivered { + e.waiterQueue.Notify(waiter.ReadableEvents) + } else { + h.ep.notifyProtocolGoroutine(notifyReset) + } + }() + + return false, nil + }() + if err != nil { + return err } + if !useSynCookies { + return nil + } + route, err := e.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) if err != nil { return err @@ -627,23 +583,6 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err return nil case s.flags.Contains(header.TCPFlagAck): - // Keep hold of acceptMu until the new endpoint is in the accept queue (or - // if there is an error), to guarantee that we will keep our spot in the - // queue even if another handshake from the syn queue completes. - e.acceptMu.Lock() - if e.accepted.acceptQueueIsFullLocked() { - // Silently drop the ack as the application can't accept - // the connection at this point. The ack will be - // retransmitted by the sender anyway and we can - // complete the connection at the time of retransmit if - // the backlog has space. - e.acceptMu.Unlock() - e.stack.Stats().TCP.ListenOverflowAckDrop.Increment() - e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment() - e.stack.Stats().DroppedPackets.Increment() - return nil - } - iss := s.ackNumber - 1 irs := s.sequenceNumber - 1 @@ -659,7 +598,6 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err // Validate the cookie. data, ok := ctx.isCookieValid(s.id, iss, irs) if !ok || int(data) >= len(mssTable) { - e.acceptMu.Unlock() e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment() e.stack.Stats().DroppedPackets.Increment() @@ -680,6 +618,24 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err // ACK was received from the sender. return replyWithReset(e.stack, s, e.sendTOS, e.ttl) } + + // Keep hold of acceptMu until the new endpoint is in the accept queue (or + // if there is an error), to guarantee that we will keep our spot in the + // queue even if another handshake from the syn queue completes. + e.acceptMu.Lock() + if e.acceptQueue.isFull() { + // Silently drop the ack as the application can't accept + // the connection at this point. The ack will be + // retransmitted by the sender anyway and we can + // complete the connection at the time of retransmit if + // the backlog has space. + e.acceptMu.Unlock() + e.stack.Stats().TCP.ListenOverflowAckDrop.Increment() + e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment() + e.stack.Stats().DroppedPackets.Increment() + return nil + } + e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment() // Create newly accepted endpoint and deliver it. rcvdSynOptions := header.TCPSynOptions{ @@ -769,7 +725,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() // Deliver the endpoint to the accept queue. - e.accepted.endpoints.PushBack(n) + e.acceptQueue.endpoints.PushBack(n) e.acceptMu.Unlock() e.waiterQueue.Notify(waiter.ReadableEvents) @@ -789,14 +745,8 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) { ctx := newListenContext(e.stack, e.protocol, e, rcvWnd, v6Only, e.NetProto) defer func() { - // Mark endpoint as closed. This will prevent goroutines running - // handleSynSegment() from attempting to queue new connections - // to the endpoint. e.setEndpointState(StateClose) - // Close any endpoints in SYN-RCVD state. - ctx.closeAllPendingEndpoints() - // Do cleanup if needed. e.completeWorkerLocked() diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 407ab2664..066ffe051 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -15,7 +15,6 @@ package tcp import ( - "container/list" "encoding/binary" "fmt" "io" @@ -205,6 +204,8 @@ type SACKInfo struct { } // ReceiveErrors collect segment receive errors within transport layer. +// +// +stateify savable type ReceiveErrors struct { tcpip.ReceiveErrors @@ -234,6 +235,8 @@ type ReceiveErrors struct { } // SendErrors collect segment send errors within the transport layer. +// +// +stateify savable type SendErrors struct { tcpip.SendErrors @@ -257,6 +260,8 @@ type SendErrors struct { } // Stats holds statistics about the endpoint. +// +// +stateify savable type Stats struct { // SegmentsReceived is the number of TCP segments received that // the transport layer successfully parsed. @@ -311,18 +316,6 @@ type rcvQueueInfo struct { rcvQueue segmentList `state:"wait"` } -// +stateify savable -type accepted struct { - // NB: this could be an endpointList, but ilist only permits endpoints to - // belong to one list at a time, and endpoints are already stored in the - // dispatcher's list. - endpoints list.List `state:".([]*endpoint)"` - - // cap is the maximum number of endpoints that can be in the accepted endpoint - // list. - cap int -} - // endpoint represents a TCP endpoint. This struct serves as the interface // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly @@ -338,7 +331,7 @@ type accepted struct { // The following three mutexes can be acquired independent of e.mu but if // acquired with e.mu then e.mu must be acquired first. // -// e.acceptMu -> Protects e.accepted. +// e.acceptMu -> Protects e.acceptQueue. // e.rcvQueueMu -> Protects e.rcvQueue and associated fields. // e.sndQueueMu -> Protects the e.sndQueue and associated fields. // e.lastErrorMu -> Protects the lastError field. @@ -502,10 +495,6 @@ type endpoint struct { // and dropped when it is. segmentQueue segmentQueue `state:"wait"` - // synRcvdCount is the number of connections for this endpoint that are - // in SYN-RCVD state; this is only accessed atomically. - synRcvdCount int32 - // userMSS if non-zero is the MSS value explicitly set by the user // for this endpoint using the TCP_MAXSEG setsockopt. userMSS uint16 @@ -579,7 +568,7 @@ type endpoint struct { // send newly accepted connections to the endpoint so that they can be // read by Accept() calls. // +checklocks:acceptMu - accepted accepted + acceptQueue acceptQueue // The following are only used from the protocol goroutine, and // therefore don't need locks to protect them. @@ -612,8 +601,7 @@ type endpoint struct { gso stack.GSO - // TODO(b/142022063): Add ability to save and restore per endpoint stats. - stats Stats `state:"nosave"` + stats Stats // tcpLingerTimeout is the maximum amount of a time a socket // a socket stays in TIME_WAIT state before being marked @@ -825,10 +813,9 @@ func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProto waiterQueue: waiterQueue, state: uint32(StateInitial), keepalive: keepalive{ - // Linux defaults. - idle: 2 * time.Hour, - interval: 75 * time.Second, - count: 9, + idle: DefaultKeepaliveIdle, + interval: DefaultKeepaliveInterval, + count: DefaultKeepaliveCount, }, uniqueID: s.UniqueID(), txHash: s.Rand().Uint32(), @@ -910,7 +897,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // Check if there's anything in the accepted queue. if (mask & waiter.ReadableEvents) != 0 { e.acceptMu.Lock() - if e.accepted.endpoints.Len() != 0 { + if e.acceptQueue.endpoints.Len() != 0 { result |= waiter.ReadableEvents } e.acceptMu.Unlock() @@ -1093,20 +1080,20 @@ func (e *endpoint) closeNoShutdownLocked() { // handshake but not yet been delivered to the application. func (e *endpoint) closePendingAcceptableConnectionsLocked() { e.acceptMu.Lock() - acceptedCopy := e.accepted - e.accepted = accepted{} - e.acceptMu.Unlock() - - if acceptedCopy == (accepted{}) { - return + // Close any endpoints in SYN-RCVD state. + for n := range e.acceptQueue.pendingEndpoints { + n.notifyProtocolGoroutine(notifyClose) } - - e.acceptCond.Broadcast() - + e.acceptQueue.pendingEndpoints = nil // Reset all connections that are waiting to be accepted. - for n := acceptedCopy.endpoints.Front(); n != nil; n = n.Next() { + for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() { n.Value.(*endpoint).notifyProtocolGoroutine(notifyReset) } + e.acceptQueue.endpoints.Init() + e.acceptMu.Unlock() + + e.acceptCond.Broadcast() + // Wait for reset of all endpoints that are still waiting to be delivered to // the now closed accepted. e.pendingAccepted.Wait() @@ -2498,22 +2485,23 @@ func (e *endpoint) listen(backlog int) tcpip.Error { if e.EndpointState() == StateListen && !e.closed { e.acceptMu.Lock() defer e.acceptMu.Unlock() - if e.accepted == (accepted{}) { - // listen is called after shutdown. - e.accepted.cap = backlog - e.shutdownFlags = 0 - e.rcvQueueInfo.rcvQueueMu.Lock() - e.rcvQueueInfo.RcvClosed = false - e.rcvQueueInfo.rcvQueueMu.Unlock() - } else { - // Adjust the size of the backlog iff we can fit - // existing pending connections into the new one. - if e.accepted.endpoints.Len() > backlog { - return &tcpip.ErrInvalidEndpointState{} - } - e.accepted.cap = backlog + + // Adjust the size of the backlog iff we can fit + // existing pending connections into the new one. + if e.acceptQueue.endpoints.Len() > backlog { + return &tcpip.ErrInvalidEndpointState{} + } + e.acceptQueue.capacity = backlog + + if e.acceptQueue.pendingEndpoints == nil { + e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{}) } + e.shutdownFlags = 0 + e.rcvQueueInfo.rcvQueueMu.Lock() + e.rcvQueueInfo.RcvClosed = false + e.rcvQueueInfo.rcvQueueMu.Unlock() + // Notify any blocked goroutines that they can attempt to // deliver endpoints again. e.acceptCond.Broadcast() @@ -2548,8 +2536,11 @@ func (e *endpoint) listen(backlog int) tcpip.Error { // may be pre-populated with some previously accepted (but not Accepted) // endpoints. e.acceptMu.Lock() - if e.accepted == (accepted{}) { - e.accepted.cap = backlog + if e.acceptQueue.pendingEndpoints == nil { + e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{}) + } + if e.acceptQueue.capacity == 0 { + e.acceptQueue.capacity = backlog } e.acceptMu.Unlock() @@ -2589,8 +2580,8 @@ func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter. // Get the new accepted endpoint. var n *endpoint e.acceptMu.Lock() - if element := e.accepted.endpoints.Front(); element != nil { - n = e.accepted.endpoints.Remove(element).(*endpoint) + if element := e.acceptQueue.endpoints.Front(); element != nil { + n = e.acceptQueue.endpoints.Remove(element).(*endpoint) } e.acceptMu.Unlock() if n == nil { @@ -3007,6 +2998,8 @@ func (e *endpoint) completeStateLocked() stack.TCPEndpointState { } s.Sender.RACKState = e.snd.rc.TCPRACKState + s.Sender.RetransmitTS = e.snd.retransmitTS + s.Sender.SpuriousRecovery = e.snd.spuriousRecovery return s } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 381f4474d..94072a115 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -100,7 +100,7 @@ func (e *endpoint) beforeSave() { } // saveEndpoints is invoked by stateify. -func (a *accepted) saveEndpoints() []*endpoint { +func (a *acceptQueue) saveEndpoints() []*endpoint { acceptedEndpoints := make([]*endpoint, a.endpoints.Len()) for i, e := 0, a.endpoints.Front(); e != nil; i, e = i+1, e.Next() { acceptedEndpoints[i] = e.Value.(*endpoint) @@ -109,7 +109,7 @@ func (a *accepted) saveEndpoints() []*endpoint { } // loadEndpoints is invoked by stateify. -func (a *accepted) loadEndpoints(acceptedEndpoints []*endpoint) { +func (a *acceptQueue) loadEndpoints(acceptedEndpoints []*endpoint) { for _, ep := range acceptedEndpoints { a.endpoints.PushBack(ep) } @@ -252,7 +252,7 @@ func (e *endpoint) Resume(s *stack.Stack) { connectedLoading.Wait() bind() e.acceptMu.Lock() - backlog := e.accepted.cap + backlog := e.acceptQueue.capacity e.acceptMu.Unlock() if err := e.Listen(backlog); err != nil { panic("endpoint listening failed: " + err.String()) diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index e4410ad93..f122ea009 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -66,6 +66,18 @@ const ( // DefaultSynRetries is the default value for the number of SYN retransmits // before a connect is aborted. DefaultSynRetries = 6 + + // DefaultKeepaliveIdle is the idle time for a connection before keep-alive + // probes are sent. + DefaultKeepaliveIdle = 2 * time.Hour + + // DefaultKeepaliveInterval is the time between two successive keep-alive + // probes. + DefaultKeepaliveInterval = 75 * time.Second + + // DefaultKeepaliveCount is the number of keep-alive probes that are sent + // before declaring the connection dead. + DefaultKeepaliveCount = 9 ) const ( diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 2fabf1594..4377f07a0 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -144,6 +144,15 @@ type sender struct { // probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm. probeTimer timer `state:"nosave"` probeWaker sleep.Waker `state:"nosave"` + + // spuriousRecovery indicates whether the sender entered recovery + // spuriously as described in RFC3522 Section 3.2. + spuriousRecovery bool + + // retransmitTS is the timestamp at which the sender sends retransmitted + // segment after entering an RTO for the first time as described in + // RFC3522 Section 3.2. + retransmitTS uint32 } // rtt is a synchronization wrapper used to appease stateify. See the comment @@ -425,6 +434,13 @@ func (s *sender) retransmitTimerExpired() bool { return true } + // Initialize the variables used to detect spurious recovery after + // entering RTO. + // + // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. + s.spuriousRecovery = false + s.retransmitTS = 0 + // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases // when writeList is empty. Remove this once we have a proper fix for this // issue. @@ -495,6 +511,10 @@ func (s *sender) retransmitTimerExpired() bool { s.leaveRecovery() } + // Record retransmitTS if the sender is not in recovery as per: + // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 + s.recordRetransmitTS() + s.state = tcpip.RTORecovery s.cc.HandleRTOExpired() @@ -958,6 +978,13 @@ func (s *sender) sendData() { } func (s *sender) enterRecovery() { + // Initialize the variables used to detect spurious recovery after + // entering recovery. + // + // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. + s.spuriousRecovery = false + s.retransmitTS = 0 + s.FastRecovery.Active = true // Save state to reflect we're now in fast recovery. // @@ -972,6 +999,11 @@ func (s *sender) enterRecovery() { s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding s.FastRecovery.HighRxt = s.SndUna s.FastRecovery.RescueRxt = s.SndUna + + // Record retransmitTS if the sender is not in recovery as per: + // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 + s.recordRetransmitTS() + if s.ep.SACKPermitted { s.state = tcpip.SACKRecovery s.ep.stack.Stats().TCP.SACKRecovery.Increment() @@ -1147,13 +1179,15 @@ func (s *sender) isDupAck(seg *segment) bool { // Iterate the writeList and update RACK for each segment which is newly acked // either cumulatively or selectively. Loop through the segments which are // sacked, and update the RACK related variables and check for reordering. +// Returns true when the DSACK block has been detected in the received ACK. // // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // steps 2 and 3. -func (s *sender) walkSACK(rcvdSeg *segment) { +func (s *sender) walkSACK(rcvdSeg *segment) bool { s.rc.setDSACKSeen(false) // Look for DSACK block. + hasDSACK := false idx := 0 n := len(rcvdSeg.parsedOptions.SACKBlocks) if checkDSACK(rcvdSeg) { @@ -1167,10 +1201,11 @@ func (s *sender) walkSACK(rcvdSeg *segment) { s.rc.setDSACKSeen(true) idx = 1 n-- + hasDSACK = true } if n == 0 { - return + return hasDSACK } // Sort the SACK blocks. The first block is the most recent unacked @@ -1193,6 +1228,7 @@ func (s *sender) walkSACK(rcvdSeg *segment) { seg = seg.Next() } } + return hasDSACK } // checkDSACK checks if a DSACK is reported. @@ -1239,6 +1275,85 @@ func checkDSACK(rcvdSeg *segment) bool { return false } +func (s *sender) recordRetransmitTS() { + // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 + // + // The Eifel detection algorithm is used, only upon initiation of loss + // recovery, i.e., when either the timeout-based retransmit or the fast + // retransmit is sent. The Eifel detection algorithm MUST NOT be + // reinitiated after loss recovery has already started. In particular, + // it must not be reinitiated upon subsequent timeouts for the same + // segment, and not upon retransmitting segments other than the oldest + // outstanding segment, e.g., during selective loss recovery. + if s.inRecovery() { + return + } + + // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 + // + // Set a "RetransmitTS" variable to the value of the Timestamp Value + // field of the Timestamps option included in the retransmit sent when + // loss recovery is initiated. A TCP sender must ensure that + // RetransmitTS does not get overwritten as loss recovery progresses, + // e.g., in case of a second timeout and subsequent second retransmit of + // the same octet. + s.retransmitTS = s.ep.tsValNow() +} + +func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) { + // Return if the sender has already detected spurious recovery. + if s.spuriousRecovery { + return + } + + // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4 + // + // If the value of the Timestamp Echo Reply field of the acceptable ACK's + // Timestamps option is smaller than the value of RetransmitTS, then + // proceed to next step, else return. + if tsEchoReply >= s.retransmitTS { + return + } + + // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5 + // + // If the acceptable ACK carries a DSACK option [RFC2883], then return. + if hasDSACK { + return + } + + // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5 + // + // If during the lifetime of the TCP connection the TCP sender has + // previously received an ACK with a DSACK option, or the acceptable ACK + // does not acknowledge all outstanding data, then proceed to next step, + // else return. + numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value() + if numDSACK == 0 && s.SndUna == s.SndNxt { + return + } + + // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6 + // + // If the loss recovery has been initiated with a timeout-based + // retransmit, then set + // SpuriousRecovery <- SPUR_TO (equal 1), + // else set + // SpuriousRecovery <- dupacks+1 + // Set the spurious recovery variable to true as we do not differentiate + // between fast, SACK or RTO recovery. + s.spuriousRecovery = true + s.ep.stack.Stats().TCP.SpuriousRecovery.Increment() +} + +// Check if the sender is in RTORecovery, FastRecovery or SACKRecovery state. +func (s *sender) inRecovery() bool { + if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery { + return true + } + return false +} + // handleRcvdSegment is called when a segment is received; it is responsible for // updating the send-related state. func (s *sender) handleRcvdSegment(rcvdSeg *segment) { @@ -1254,6 +1369,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { } // Insert SACKBlock information into our scoreboard. + hasDSACK := false if s.ep.SACKPermitted { for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { // Only insert the SACK block if the following holds @@ -1288,7 +1404,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // RACK.fack, then the corresponding packet has been // reordered and RACK.reord is set to TRUE. if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { - s.walkSACK(rcvdSeg) + hasDSACK = s.walkSACK(rcvdSeg) } s.SetPipe() } @@ -1418,6 +1534,11 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // Clear SACK information for all acked data. s.ep.scoreboard.Delete(s.SndUna) + // Detect if the sender entered recovery spuriously. + if s.inRecovery() { + s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr) + } + // If we are not in fast recovery then update the congestion // window based on the number of acknowledged packets. if !s.FastRecovery.Active { diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go index c35db7c95..0d36d0dd0 100644 --- a/pkg/tcpip/transport/tcp/tcp_rack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -1059,16 +1059,17 @@ func TestRACKWithWindowFull(t *testing.T) { for i := 0; i < numPkts; i++ { c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize) bytesRead += maxPayload - if i == 0 { - // Send ACK for the first packet to establish RTT. - c.SendAck(seq, maxPayload) - } } - // SACK for #10 packet. - start := c.IRS.Add(seqnum.Size(1 + (numPkts-1)*maxPayload)) + // Expect retransmission of last packet due to TLP. + c.ReceiveAndCheckPacketWithOptions(data, (numPkts-1)*maxPayload, maxPayload, tsOptionSize) + + // SACK for first and last packet. + start := c.IRS.Add(seqnum.Size(maxPayload)) end := start.Add(seqnum.Size(maxPayload)) - c.SendAckWithSACK(seq, 2*maxPayload, []header.SACKBlock{{start, end}}) + dsackStart := c.IRS.Add(seqnum.Size(1 + (numPkts-1)*maxPayload)) + dsackEnd := dsackStart.Add(seqnum.Size(maxPayload)) + c.SendAckWithSACK(seq, 2*maxPayload, []header.SACKBlock{{dsackStart, dsackEnd}, {start, end}}) var info tcpip.TCPInfoOption if err := c.EP.GetSockOpt(&info); err != nil { diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 6255355bb..896249d2d 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -23,6 +23,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/checker" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -702,3 +703,257 @@ func TestRecoveryEntry(t *testing.T) { t.Error(err) } } + +func verifySpuriousRecoveryMetric(t *testing.T, c *context.Context, numSpuriousRecovery uint64) { + t.Helper() + + metricPollFn := func() error { + tcpStats := c.Stack().Stats().TCP + stats := []struct { + stat *tcpip.StatCounter + name string + want uint64 + }{ + {tcpStats.SpuriousRecovery, "stats.TCP.SpuriousRecovery", numSpuriousRecovery}, + } + for _, s := range stats { + if got, want := s.stat.Value(), s.want; got != want { + return fmt.Errorf("got %s.Value() = %d, want = %d", s.name, got, want) + } + } + return nil + } + + if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil { + t.Error(err) + } +} + +func checkReceivedPacket(t *testing.T, c *context.Context, tcpHdr header.TCP, bytesRead uint32, b, data []byte) { + payloadLen := uint32(len(tcpHdr.Payload())) + checker.IPv4(t, b, + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPSeqNum(uint32(c.IRS)+1+bytesRead), + checker.TCPAckNum(context.TestInitialSequenceNumber+1), + checker.TCPFlagsMatch(header.TCPFlagAck, ^header.TCPFlagPsh), + ), + ) + pdata := data[bytesRead : bytesRead+payloadLen] + if p := tcpHdr.Payload(); !bytes.Equal(pdata, p) { + t.Fatalf("got data = %v, want = %v", p, pdata) + } +} + +func buildTSOptionFromHeader(tcpHdr header.TCP) []byte { + parsedOpts := tcpHdr.ParsedOptions() + tsOpt := [12]byte{header.TCPOptionNOP, header.TCPOptionNOP} + header.EncodeTSOption(parsedOpts.TSEcr+1, parsedOpts.TSVal, tsOpt[2:]) + return tsOpt[:] +} + +func TestDetectSpuriousRecoveryWithRTO(t *testing.T) { + c := context.New(t, uint32(mtu)) + defer c.Cleanup() + + probeDone := make(chan struct{}) + c.Stack().AddTCPProbe(func(s stack.TCPEndpointState) { + if s.Sender.RetransmitTS == 0 { + t.Fatalf("RetransmitTS did not get updated, got: 0 want > 0") + } + if !s.Sender.SpuriousRecovery { + t.Fatalf("Spurious recovery was not detected") + } + close(probeDone) + }) + + setStackSACKPermitted(t, c, true) + createConnectedWithSACKAndTS(c) + numPackets := 5 + data := make([]byte, numPackets*maxPayload) + for i := range data { + data[i] = byte(i) + } + // Write the data. + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + + var options []byte + var bytesRead uint32 + for i := 0; i < numPackets; i++ { + b := c.GetPacket() + tcpHdr := header.TCP(header.IPv4(b).Payload()) + checkReceivedPacket(t, c, tcpHdr, bytesRead, b, data) + + // Get options only for the first packet. This will be sent with + // the ACK to indicate the acknowledgement is for the original + // packet. + if i == 0 && c.TimeStampEnabled { + options = buildTSOptionFromHeader(tcpHdr) + } + bytesRead += uint32(len(tcpHdr.Payload())) + } + + seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) + // Expect #5 segment with TLP. + c.ReceiveAndCheckPacketWithOptions(data, 4*maxPayload, maxPayload, tsOptionSize) + + // Expect #1 segment because of RTO. + c.ReceiveAndCheckPacketWithOptions(data, 0, maxPayload, tsOptionSize) + + info := tcpip.TCPInfoOption{} + if err := c.EP.GetSockOpt(&info); err != nil { + t.Fatalf("c.EP.GetSockOpt(&%T) = %s", info, err) + } + + if info.CcState != tcpip.RTORecovery { + t.Fatalf("Loss recovery did not happen, got: %v want: %v", info.CcState, tcpip.RTORecovery) + } + + // Acknowledge the data. + rcvWnd := seqnum.Size(30000) + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: seq, + AckNum: c.IRS.Add(1 + seqnum.Size(maxPayload)), + RcvWnd: rcvWnd, + TCPOpts: options, + }) + + // Wait for the probe function to finish processing the + // ACK before the test completes. + <-probeDone + + verifySpuriousRecoveryMetric(t, c, 1 /* numSpuriousRecovery */) +} + +func TestSACKDetectSpuriousRecoveryWithDupACK(t *testing.T) { + c := context.New(t, uint32(mtu)) + defer c.Cleanup() + + numAck := 0 + probeDone := make(chan struct{}) + c.Stack().AddTCPProbe(func(s stack.TCPEndpointState) { + if numAck < 3 { + numAck++ + return + } + + if s.Sender.RetransmitTS == 0 { + t.Fatalf("RetransmitTS did not get updated, got: 0 want > 0") + } + if !s.Sender.SpuriousRecovery { + t.Fatalf("Spurious recovery was not detected") + } + close(probeDone) + }) + + setStackSACKPermitted(t, c, true) + createConnectedWithSACKAndTS(c) + numPackets := 5 + data := make([]byte, numPackets*maxPayload) + for i := range data { + data[i] = byte(i) + } + // Write the data. + var r bytes.Reader + r.Reset(data) + if _, err := c.EP.Write(&r, tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + + var options []byte + var bytesRead uint32 + for i := 0; i < numPackets; i++ { + b := c.GetPacket() + tcpHdr := header.TCP(header.IPv4(b).Payload()) + checkReceivedPacket(t, c, tcpHdr, bytesRead, b, data) + + // Get options only for the first packet. This will be sent with + // the ACK to indicate the acknowledgement is for the original + // packet. + if i == 0 && c.TimeStampEnabled { + options = buildTSOptionFromHeader(tcpHdr) + } + bytesRead += uint32(len(tcpHdr.Payload())) + } + + // Receive the retransmitted packet after TLP. + c.ReceiveAndCheckPacketWithOptions(data, 4*maxPayload, maxPayload, tsOptionSize) + + seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) + // Send ACK for #3 and #4 segments to avoid entering TLP. + start := c.IRS.Add(3*maxPayload + 1) + end := start.Add(2 * maxPayload) + c.SendAckWithSACK(seq, 0, []header.SACKBlock{{start, end}}) + + c.SendAck(seq, 0 /* bytesReceived */) + c.SendAck(seq, 0 /* bytesReceived */) + + // Receive the retransmitted packet after three duplicate ACKs. + c.ReceiveAndCheckPacketWithOptions(data, 0, maxPayload, tsOptionSize) + + info := tcpip.TCPInfoOption{} + if err := c.EP.GetSockOpt(&info); err != nil { + t.Fatalf("c.EP.GetSockOpt(&%T) = %s", info, err) + } + + if info.CcState != tcpip.SACKRecovery { + t.Fatalf("Loss recovery did not happen, got: %v want: %v", info.CcState, tcpip.SACKRecovery) + } + + // Acknowledge the data. + rcvWnd := seqnum.Size(30000) + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: seq, + AckNum: c.IRS.Add(1 + seqnum.Size(maxPayload)), + RcvWnd: rcvWnd, + TCPOpts: options, + }) + + // Wait for the probe function to finish processing the + // ACK before the test completes. + <-probeDone + + verifySpuriousRecoveryMetric(t, c, 1 /* numSpuriousRecovery */) +} + +func TestNoSpuriousRecoveryWithDSACK(t *testing.T) { + c := context.New(t, uint32(mtu)) + defer c.Cleanup() + setStackSACKPermitted(t, c, true) + createConnectedWithSACKAndTS(c) + numPackets := 5 + data := sendAndReceiveWithSACK(t, c, numPackets, true /* enableRACK */) + + // Receive the retransmitted packet after TLP. + c.ReceiveAndCheckPacketWithOptions(data, 4*maxPayload, maxPayload, tsOptionSize) + + // Send ACK for #3 and #4 segments to avoid entering TLP. + start := c.IRS.Add(3*maxPayload + 1) + end := start.Add(2 * maxPayload) + seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) + c.SendAckWithSACK(seq, 0, []header.SACKBlock{{start, end}}) + + c.SendAck(seq, 0 /* bytesReceived */) + c.SendAck(seq, 0 /* bytesReceived */) + + // Receive the retransmitted packet after three duplicate ACKs. + c.ReceiveAndCheckPacketWithOptions(data, 0, maxPayload, tsOptionSize) + + // Acknowledge the data with DSACK for #1 segment. + start = c.IRS.Add(maxPayload + 1) + end = start.Add(2 * maxPayload) + seq = seqnum.Value(context.TestInitialSequenceNumber).Add(1) + c.SendAckWithSACK(seq, 6*maxPayload, []header.SACKBlock{{start, end}}) + + verifySpuriousRecoveryMetric(t, c, 0 /* numSpuriousRecovery */) +} diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 049957b81..39b1e08c0 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -233,7 +233,7 @@ func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult // Control Messages cm := tcpip.ControlMessages{ HasTimestamp: true, - Timestamp: p.receivedAt.UnixNano(), + Timestamp: p.receivedAt, } switch p.netProto { diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index 234125c38..8902be2d3 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -10,6 +10,7 @@ go_library( ], visibility = ["//visibility:public"], deps = [ + "//pkg/eventfd", "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index 40fa72925..0dc0c37bd 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -23,6 +23,7 @@ import ( "sync/atomic" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/sync" ) @@ -55,15 +56,6 @@ func socket(packet bool) (int, error) { return fd, nil } -// eventFD returns a new event FD with initial value 0. -func eventFD() (int, error) { - f, _, e := unix.Syscall(unix.SYS_EVENTFD2, 0, 0, 0) - if e != 0 { - return -1, e - } - return int(f), nil -} - // Socket is a connected unix domain socket. type Socket struct { // gate protects use of fd. @@ -78,7 +70,7 @@ type Socket struct { // efd is an event FD that is signaled when the socket is closing. // // efd is immutable and remains valid until Close/Release. - efd int + efd eventfd.Eventfd // race is an atomic variable used to avoid triggering the race // detector. See comment in SocketPair below. @@ -95,7 +87,7 @@ func NewSocket(fd int) (*Socket, error) { return nil, err } - efd, err := eventFD() + efd, err := eventfd.Create() if err != nil { return nil, err } @@ -110,16 +102,14 @@ func NewSocket(fd int) (*Socket, error) { // closing the event FD. func (s *Socket) finish() error { // Signal any blocked or future polls. - // - // N.B. eventfd writes must be 8 bytes. - if _, err := unix.Write(s.efd, []byte{1, 0, 0, 0, 0, 0, 0, 0}); err != nil { + if err := s.efd.Notify(); err != nil { return err } // Close the gate, blocking until all FD users leave. s.gate.Close() - return unix.Close(s.efd) + return s.efd.Close() } // Close closes the socket. diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go index f0bf93ddd..ea281fec3 100644 --- a/pkg/unet/unet_unsafe.go +++ b/pkg/unet/unet_unsafe.go @@ -43,7 +43,7 @@ func (s *Socket) wait(write bool) error { }, { // The eventfd, signaled when we are closing. - Fd: int32(s.efd), + Fd: int32(s.efd.FD()), Events: unix.POLLIN, }, } diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index ff7a5a44b..36806b740 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -80,7 +80,6 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/sighandling", "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/netfilter", "//pkg/sentry/socket/netlink", @@ -96,6 +95,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/watchdog", + "//pkg/sighandling", "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/link/ethernet", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 703f34827..db363435b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -304,6 +304,22 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, }, + unix.SYS_TIMER_CREATE: []seccomp.Rule{ + { + seccomp.EqualTo(unix.CLOCK_THREAD_CPUTIME_ID), /* which */ + seccomp.MatchAny{}, /* sevp */ + seccomp.MatchAny{}, /* timerid */ + }, + }, + unix.SYS_TIMER_DELETE: []seccomp.Rule{}, + unix.SYS_TIMER_SETTIME: []seccomp.Rule{ + { + seccomp.MatchAny{}, /* timerid */ + seccomp.EqualTo(0), /* flags */ + seccomp.MatchAny{}, /* new_value */ + seccomp.EqualTo(0), /* old_value */ + }, + }, unix.SYS_TGKILL: []seccomp.Rule{ { seccomp.EqualTo(uint64(os.Getpid())), @@ -630,6 +646,11 @@ func hostInetFilters() seccomp.SyscallRules { func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ + unix.SYS_ACCEPT4: []seccomp.Rule{ + { + seccomp.EqualTo(fd), + }, + }, unix.SYS_ACCEPT: []seccomp.Rule{ { seccomp.EqualTo(fd), diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index b46d84e5a..2f2d4df5e 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -49,13 +49,13 @@ import ( "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/sighandling" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" @@ -241,10 +241,8 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true - if args.Conf.FUSE { - kernel.FUSEEnabled = true - } - + kernel.FUSEEnabled = args.Conf.FUSE + kernel.LISAFSEnabled = args.Conf.Lisafs vfs2.Override() } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 2f1332566..ac1e5ac37 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -173,7 +173,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create rootProcArgs.Credentials = rootCreds rootProcArgs.Umask = 0022 rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals - rootCtx := procArgs.NewContext(c.k) + rootCtx := rootProcArgs.NewContext(c.k) mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index f7e892584..d3aec1fff 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/cleanup", "//pkg/log", + "//pkg/sync", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 5dbf14376..7a0f0694f 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -19,7 +19,6 @@ package cgroup import ( "bufio" "context" - "errors" "fmt" "io" "io/ioutil" @@ -34,6 +33,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" ) const ( @@ -104,17 +104,21 @@ func setOptionalValueUint16(path, name string, val *uint16) error { func setValue(path, name, data string) error { fullpath := filepath.Join(path, name) + log.Debugf("Setting %q to %q", fullpath, data) + return writeFile(fullpath, []byte(data), 0700) +} - // Retry writes on EINTR; see: - // https://github.com/golang/go/issues/38033 - for { - err := ioutil.WriteFile(fullpath, []byte(data), 0700) - if err == nil { - return nil - } else if !errors.Is(err, unix.EINTR) { - return err - } +// writeFile is similar to ioutil.WriteFile() but doesn't create the file if it +// doesn't exist. +func writeFile(path string, data []byte, perm os.FileMode) error { + f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, perm) + if err != nil { + return err } + defer f.Close() + + _, err = f.Write(data) + return err } func getValue(path, name string) (string, error) { @@ -155,15 +159,8 @@ func fillFromAncestor(path string) (string, error) { return "", err } - // Retry writes on EINTR; see: - // https://github.com/golang/go/issues/38033 - for { - err := ioutil.WriteFile(path, []byte(val), 0700) - if err == nil { - break - } else if !errors.Is(err, unix.EINTR) { - return "", err - } + if err := writeFile(path, []byte(val), 0700); err != nil { + return "", nil } return val, nil } @@ -309,7 +306,13 @@ func NewFromSpec(spec *specs.Spec) (*Cgroup, error) { if spec.Linux == nil || spec.Linux.CgroupsPath == "" { return nil, nil } - return new("self", spec.Linux.CgroupsPath) + return NewFromPath(spec.Linux.CgroupsPath) +} + +// NewFromPath creates a new Cgroup instance from the specified relative path. +// Cgroup paths are loaded based on the current process. +func NewFromPath(cgroupsPath string) (*Cgroup, error) { + return new("self", cgroupsPath) } // NewFromPid loads cgroup for the given process. @@ -365,21 +368,20 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { } for _, key := range missing { ctrlr := controllers[key] - path := c.MakePath(key) - log.Debugf("Creating cgroup %q: %q", key, path) - if err := os.MkdirAll(path, 0755); err != nil { - if ctrlr.optional() && errors.Is(err, unix.EROFS) { - if err := ctrlr.skip(res); err != nil { - return err - } - log.Infof("Skipping cgroup %q", key) - continue + + if skip, err := c.createController(key); skip && ctrlr.optional() { + if err := ctrlr.skip(res); err != nil { + return err } + log.Infof("Skipping cgroup %q, err: %v", key, err) + continue + } else if err != nil { return err } // Only set controllers that were created by me. c.Own[key] = true + path := c.MakePath(key) if err := ctrlr.set(res, path); err != nil { return err } @@ -388,10 +390,29 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { return nil } +// createController creates the controller directory, checking that the +// controller is enabled in the system. It returns a boolean indicating whether +// the controller should be skipped (e.g. controller is disabled). In case it +// should be skipped, it also returns the error it got. +func (c *Cgroup) createController(name string) (bool, error) { + ctrlrPath := filepath.Join(cgroupRoot, name) + if _, err := os.Stat(ctrlrPath); err != nil { + return os.IsNotExist(err), err + } + + path := c.MakePath(name) + log.Debugf("Creating cgroup %q: %q", name, path) + if err := os.MkdirAll(path, 0755); err != nil { + return false, err + } + return false, nil +} + // Uninstall removes the settings done in Install(). If cgroup path already // existed when Install() was called, Uninstall is a noop. func (c *Cgroup) Uninstall() error { log.Debugf("Deleting cgroup %q", c.Name) + wait := sync.WaitGroupErr{} for key := range controllers { if !c.Own[key] { // cgroup is managed by caller, don't touch it. @@ -400,9 +421,8 @@ func (c *Cgroup) Uninstall() error { path := c.MakePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) - // If we try to remove the cgroup too soon after killing the - // sandbox we might get EBUSY, so we retry for a few seconds - // until it succeeds. + // If we try to remove the cgroup too soon after killing the sandbox we + // might get EBUSY, so we retry for a few seconds until it succeeds. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) @@ -413,11 +433,18 @@ func (c *Cgroup) Uninstall() error { } return err } - if err := backoff.Retry(fn, b); err != nil { - return fmt.Errorf("removing cgroup path %q: %w", path, err) - } + // Run deletions in parallel to remove all directories even if there are + // failures/timeouts in other directories. + wait.Add(1) + go func() { + defer wait.Done() + if err := backoff.Retry(fn, b); err != nil { + wait.ReportError(fmt.Errorf("removing cgroup path %q: %w", path, err)) + return + } + }() } - return nil + return wait.Error() } // Join adds the current process to the all controllers. Returns function that diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 1431b4e8f..0b6a5431b 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -129,6 +129,18 @@ func boolPtr(v bool) *bool { return &v } +func createDir(dir string, contents map[string]string) error { + for name := range contents { + path := filepath.Join(dir, name) + f, err := os.Create(path) + if err != nil { + return err + } + f.Close() + } + return nil +} + func checkDir(t *testing.T, dir string, contents map[string]string) { all, err := ioutil.ReadDir(dir) if err != nil { @@ -254,6 +266,9 @@ func TestBlockIO(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ BlockIO: tc.spec, @@ -304,6 +319,9 @@ func TestCPU(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ CPU: tc.spec, @@ -343,6 +361,9 @@ func TestCPUSet(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ CPU: tc.spec, @@ -481,6 +502,9 @@ func TestHugeTlb(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ HugepageLimits: tc.spec, @@ -542,6 +566,9 @@ func TestMemory(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Memory: tc.spec, @@ -584,6 +611,9 @@ func TestNetworkClass(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Network: tc.spec, @@ -631,6 +661,9 @@ func TestNetworkPriority(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Network: tc.spec, @@ -671,6 +704,9 @@ func TestPids(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Pids: tc.spec, diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 5314549d6..4e744e604 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -19,7 +19,7 @@ go_library( "//pkg/cleanup", "//pkg/log", "//pkg/sentry/control", - "//pkg/sentry/sighandling", + "//pkg/sighandling", "//pkg/sync", "//runsc/boot", "//runsc/cgroup", diff --git a/runsc/container/container.go b/runsc/container/container.go index 9c0004753..6a59df411 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -35,7 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" @@ -44,6 +44,8 @@ import ( "gvisor.dev/gvisor/runsc/specutils" ) +const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" + // validateID validates the container id. func validateID(id string) error { // See libcontainer/factory_linux.go. @@ -113,6 +115,16 @@ type Container struct { // container is created and reset when the sandbox is destroyed. Sandbox *sandbox.Sandbox `json:"sandbox"` + // CompatCgroup has the cgroup configuration for the container. For the single + // container case, container cgroup is set in `c.Sandbox` only. CompactCgroup + // is only set for multi-container, where the `c.Sandbox` cgroup represents + // the entire pod. + // + // Note that CompatCgroup is created only for compatibility with tools + // that expect container cgroups to exist. Setting limits here makes no change + // to the container in question. + CompatCgroup *cgroup.Cgroup `json:"compatCgroup"` + // Saver handles load from/save to the state file safely from multiple // processes. Saver StateFile `json:"saver"` @@ -233,27 +245,12 @@ func New(conf *config.Config, args Args) (*Container, error) { } // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all their children processes). - cg, err := cgroup.NewFromSpec(args.Spec) + parentCgroup, subCgroup, err := c.setupCgroupForRoot(conf, args.Spec) if err != nil { return nil, err } - if cg != nil { - // TODO(gvisor.dev/issue/3481): Remove when cgroups v2 is supported. - if !conf.Rootless && cgroup.IsOnlyV2() { - return nil, fmt.Errorf("cgroups V2 is not yet supported. Enable cgroups V1 and retry") - } - // If there is cgroup config, install it before creating sandbox process. - if err := cg.Install(args.Spec.Linux.Resources); err != nil { - switch { - case errors.Is(err, unix.EACCES) && conf.Rootless: - log.Warningf("Skipping cgroup configuration in rootless mode: %v", err) - cg = nil - default: - return nil, fmt.Errorf("configuring cgroup: %v", err) - } - } - } - if err := runInCgroup(cg, func() error { + c.CompatCgroup = subCgroup + if err := runInCgroup(parentCgroup, func() error { ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached) if err != nil { return err @@ -269,7 +266,7 @@ func New(conf *config.Config, args Args) (*Container, error) { UserLog: args.UserLog, IOFiles: ioFiles, MountsFile: specFile, - Cgroup: cg, + Cgroup: parentCgroup, Attached: args.Attached, } sand, err := sandbox.New(conf, sandArgs) @@ -296,6 +293,12 @@ func New(conf *config.Config, args Args) (*Container, error) { } c.Sandbox = sb.Sandbox + subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec) + if err != nil { + return nil, err + } + c.CompatCgroup = subCgroup + // If the console control socket file is provided, then create a new // pty master/slave pair and send the TTY to the sandbox process. var tty *os.File @@ -781,16 +784,16 @@ func (c *Container) saveLocked() error { // root containers), and waits for the container or sandbox and the gofer // to stop. If any of them doesn't stop before timeout, an error is returned. func (c *Container) stop() error { - var cgroup *cgroup.Cgroup + var parentCgroup *cgroup.Cgroup if c.Sandbox != nil { log.Debugf("Destroying container, cid: %s", c.ID) if err := c.Sandbox.DestroyContainer(c.ID); err != nil { return fmt.Errorf("destroying container %q: %v", c.ID, err) } - // Only uninstall cgroup for sandbox stop. + // Only uninstall parentCgroup for sandbox stop. if c.Sandbox.IsRootContainer(c.ID) { - cgroup = c.Sandbox.Cgroup + parentCgroup = c.Sandbox.Cgroup } // Only set sandbox to nil after it has been told to destroy the container. c.Sandbox = nil @@ -809,9 +812,16 @@ func (c *Container) stop() error { return err } - // Gofer is running in cgroups, so Cgroup.Uninstall has to be called after it. - if cgroup != nil { - if err := cgroup.Uninstall(); err != nil { + // Delete container cgroup if any. + if c.CompatCgroup != nil { + if err := c.CompatCgroup.Uninstall(); err != nil { + return err + } + } + // Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called + // after the gofer has stopped. + if parentCgroup != nil { + if err := parentCgroup.Uninstall(); err != nil { return err } } @@ -1208,3 +1218,77 @@ func (c *Container) populateStats(event *boot.EventOut) { event.Event.Data.CPU.Usage.Total = uint64(total) return } + +// setupCgroupForRoot configures and returns cgroup for the sandbox and the +// root container. If `cgroupParentAnnotation` is set, use that path as the +// sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup. +func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (*cgroup.Cgroup, *cgroup.Cgroup, error) { + var parentCgroup *cgroup.Cgroup + if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok { + var err error + parentCgroup, err = cgroup.NewFromPath(parentPath) + if err != nil { + return nil, nil, err + } + } else { + var err error + parentCgroup, err = cgroup.NewFromSpec(spec) + if parentCgroup == nil || err != nil { + return nil, nil, err + } + } + + var err error + parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources) + if parentCgroup == nil || err != nil { + return nil, nil, err + } + + subCgroup, err := c.setupCgroupForSubcontainer(conf, spec) + if err != nil { + _ = parentCgroup.Uninstall() + return nil, nil, err + } + return parentCgroup, subCgroup, nil +} + +// setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since +// subcontainers run exclusively inside the sandbox, subcontainer cgroups on the +// host have no effect on them. However, some tools (e.g. cAdvisor) uses cgroups +// paths to discover new containers and report stats for them. +func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (*cgroup.Cgroup, error) { + if isRoot(spec) { + if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok { + return nil, nil + } + } + + cg, err := cgroup.NewFromSpec(spec) + if cg == nil || err != nil { + return nil, err + } + // Use empty resources, just want the directory structure created. + return cgroupInstall(conf, cg, &specs.LinuxResources{}) +} + +// cgroupInstall creates cgroups dir structure and sets their respective +// resources. In case of success, returns the cgroups instance and nil error. +// For rootless, it's possible that cgroups operations fail, in this case the +// error is suppressed and a nil cgroups instance is returned to indicate that +// no cgroups was configured. +func cgroupInstall(conf *config.Config, cg *cgroup.Cgroup, res *specs.LinuxResources) (*cgroup.Cgroup, error) { + // TODO(gvisor.dev/issue/3481): Remove when cgroups v2 is supported. + if !conf.Rootless && cgroup.IsOnlyV2() { + return nil, fmt.Errorf("cgroups V2 is not yet supported. Enable cgroups V1 and retry") + } + if err := cg.Install(res); err != nil { + switch { + case errors.Is(err, unix.EACCES) && conf.Rootless: + log.Warningf("Skipping cgroup configuration in rootless mode: %v", err) + return nil, nil + default: + return nil, fmt.Errorf("configuring cgroup: %v", err) + } + } + return cg, nil +} diff --git a/test/packetimpact/runner/dut.go b/test/packetimpact/runner/dut.go index 02678a76a..f27e52f93 100644 --- a/test/packetimpact/runner/dut.go +++ b/test/packetimpact/runner/dut.go @@ -331,6 +331,22 @@ func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Co t.Logf("sniffer logs:\n%s", snifferOut) } }) + } + + // Arm the cleanup hook before we do anthing else. Otherwise failures below + // can cause the test to hang in the cleanup hook above. + t.Cleanup(func() { + // Wait 1 second before killing tcpdump to give it time to flush + // any packets. On linux tests killing it immediately can + // sometimes result in partial pcaps. + time.Sleep(1 * time.Second) + if logs, err := testbenchContainer.Exec(ctx, dockerutil.ExecOpts{}, "killall", baseSnifferArgs[0]); err != nil { + t.Errorf("failed to kill all sniffers: %s, logs: %s", err, logs) + } + }) + + for _, info := range dutInfos { + n := info.Net // When the Linux kernel receives a SYN-ACK for a SYN it didn't send, it // will respond with an RST. In most packetimpact tests, the SYN is sent // by the raw socket, the kernel knows nothing about the connection, this @@ -344,16 +360,6 @@ func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Co } } - t.Cleanup(func() { - // Wait 1 second before killing tcpdump to give it time to flush - // any packets. On linux tests killing it immediately can - // sometimes result in partial pcaps. - time.Sleep(1 * time.Second) - if logs, err := testbenchContainer.Exec(ctx, dockerutil.ExecOpts{}, "killall", baseSnifferArgs[0]); err != nil { - t.Errorf("failed to kill all sniffers: %s, logs: %s", err, logs) - } - }) - // FIXME(b/156449515): Some piece of the system has a race. The old // bash script version had a sleep, so we have one too. The race should // be fixed and this sleep removed. diff --git a/test/packetimpact/tests/tcp_listen_backlog_test.go b/test/packetimpact/tests/tcp_listen_backlog_test.go index fea7d5b6f..e124002f6 100644 --- a/test/packetimpact/tests/tcp_listen_backlog_test.go +++ b/test/packetimpact/tests/tcp_listen_backlog_test.go @@ -15,7 +15,9 @@ package tcp_listen_backlog_test import ( + "bytes" "flag" + "sync" "testing" "time" @@ -35,60 +37,272 @@ func init() { func TestTCPListenBacklog(t *testing.T) { dut := testbench.NewDUT(t) - // Listening endpoint accepts one more connection than the listen backlog. - listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 0 /*backlog*/) + // This is the number of pending connections before SYN cookies are used. + const backlog = 10 - var establishedConn testbench.TCPIPv4 - var incompleteConn testbench.TCPIPv4 + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, backlog) + defer dut.Close(t, listenFd) - // Test if the DUT listener replies to more SYNs than listen backlog+1 - for i, conn := range []*testbench.TCPIPv4{&establishedConn, &incompleteConn} { - *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - // Expect dut connection to have transitioned to SYN-RCVD state. + // Fill the SYN queue with connections in SYN-RCVD. We will use these to test + // that ACKs received while the accept queue is full are ignored. + var synQueueConns [backlog]testbench.TCPIPv4 + defer func() { + for i := range synQueueConns { + synQueueConns[i].Close(t) + } + }() + { + var wg sync.WaitGroup + for i := range synQueueConns { + conn := &synQueueConns[i] + *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{}) + + wg.Add(1) + go func(i int) { + defer wg.Done() + + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagSyn|header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + }(i) + } + wg.Wait() + if t.Failed() { + t.FailNow() + } + } + + const payloadLen = 1 + payload := testbench.Payload{Bytes: testbench.GenerateRandomPayload(t, payloadLen)} + + // Fill the accept queue with connections established using SYN cookies. + var synCookieConns [backlog + 1]testbench.TCPIPv4 + defer func() { + for i := range synCookieConns { + synCookieConns[i].Close(t) + } + }() + { + var wg sync.WaitGroup + for i := range synCookieConns { + conn := &synCookieConns[i] + *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{}) + + wg.Add(1) + go func(i int) { + defer wg.Done() + + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagSyn|header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + // Send a payload so we can observe the dut ACK. + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}, &payload) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + }(i) + } + wg.Wait() + if t.Failed() { + t.FailNow() + } + } + + // Send ACKs to complete the handshakes. These are expected to be dropped + // because the accept queue is full. + { + var wg sync.WaitGroup + for i := range synQueueConns { + conn := &synQueueConns[i] + wg.Add(1) + go func(i int) { + defer wg.Done() + + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) + // Wait for the SYN-ACK to be retransmitted to confirm the ACK was + // dropped. + seqNum := uint32(*conn.RemoteSeqNum(t) - 1) + if got, err := conn.Expect(t, testbench.TCP{SeqNum: &seqNum}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagSyn|header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + }(i) + } + + wg.Wait() + if t.Failed() { + t.FailNow() + } + } + + // While the accept queue is still full, send an unexpected ACK from a new + // socket. The listener should reply with an RST. + func() { + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{}) + defer conn.Close(t) + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("expected TCP frame: %s", err) + } else if got, want := *got.Flags, header.TCPFlagRst; got != want { + t.Errorf("got %s, want %s", got, want) + } + }() + + func() { + // Now initiate a new connection when the accept queue is full. + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{}) + defer conn.Close(t) + // Expect dut connection to drop the SYN. conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) - if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { - t.Fatalf("expected SYN-ACK for %d connection, %s", i, err) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err == nil { + t.Fatalf("expected no TCP frame, got %s", got) + } + }() + + // Drain the accept queue. + { + var wg sync.WaitGroup + for i := range synCookieConns { + conn := &synCookieConns[i] + + wg.Add(1) + go func(i int) { + defer wg.Done() + + fd, _ := dut.Accept(t, listenFd) + b := dut.Recv(t, fd, payloadLen+1, 0) + dut.Close(t, fd) + if !bytes.Equal(b, payload.Bytes) { + t.Errorf("connection %d: got dut.Recv = %x, want = %x", i, b, payload.Bytes) + } + + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagFin|header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + + // Prevent retransmission. + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) + }(i) + } + wg.Wait() + if t.Failed() { + t.FailNow() } } - defer establishedConn.Close(t) - defer incompleteConn.Close(t) - - // Send the ACK to complete handshake. - establishedConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) - - // Poll for the established connection ready for accept. - dut.PollOne(t, listenFd, unix.POLLIN, time.Second) - - // Send the ACK to complete handshake, expect this to be dropped by the - // listener as the accept queue would be full because of the previous - // handshake. - incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) - // Let the test wait for sometime so that the ACK is indeed dropped by - // the listener. Without such a wait, the DUT accept can race with - // ACK handling (dropping) causing the test to be flaky. - time.Sleep(100 * time.Millisecond) - - // Drain the accept queue to enable poll for subsequent connections on the - // listener. - fd, _ := dut.Accept(t, listenFd) - dut.Close(t, fd) - - // The ACK for the incomplete connection should be ignored by the - // listening endpoint and the poll on listener should now time out. - if pfds := dut.Poll(t, []unix.PollFd{{Fd: listenFd, Events: unix.POLLIN}}, time.Second); len(pfds) != 0 { - t.Fatalf("got dut.Poll(...) = %#v", pfds) + + // Complete the partial connections to move them from the SYN queue to the + // accept queue. We will use these to test that connections in the accept + // queue are closed on listener shutdown. + { + var wg sync.WaitGroup + for i := range synQueueConns { + conn := &synQueueConns[i] + wg.Add(1) + go func(i int) { + defer wg.Done() + + tcp := testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)} + + // Exercise connections with and without pending data. + if i%2 == 0 { + // Send ACK with no payload; wait for absence of SYN-ACK retransmit. + conn.Send(t, tcp) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err == nil { + t.Errorf("%d: expected no TCP frame, got %s", i, got) + } + } else { + // Send ACK with payload; wait for ACK. + conn.Send(t, tcp, &payload) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + } + }(i) + } + + wg.Wait() + if t.Failed() { + t.FailNow() + } } - // Re-send the ACK to complete handshake and re-fill the accept-queue. - incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) - dut.PollOne(t, listenFd, unix.POLLIN, time.Second) - - // Now initiate a new connection when the accept queue is full. - connectingConn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) - defer connectingConn.Close(t) - // Expect dut connection to drop the SYN and let the client stay in SYN_SENT state. - connectingConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) - if got, err := connectingConn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err == nil { - t.Fatalf("expected no SYN-ACK, but got %s", got) + // The accept queue now has N-1 connections in it. The next incoming SYN will + // enter the SYN queue, and the one following will use SYN cookies. We test + // both. + var connectingConns [2]testbench.TCPIPv4 + defer func() { + for i := range connectingConns { + connectingConns[i].Close(t) + } + }() + { + var wg sync.WaitGroup + for i := range connectingConns { + conn := &connectingConns[i] + *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{}) + + wg.Add(1) + go func(i int) { + defer wg.Done() + + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagSyn|header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + }(i) + } + wg.Wait() + if t.Failed() { + t.FailNow() + } + } + + dut.Shutdown(t, listenFd, unix.SHUT_RD) + + var wg sync.WaitGroup + + // Shutdown causes Connections in the accept queue to be closed. + for i := range synQueueConns { + conn := &synQueueConns[i] + wg.Add(1) + go func(i int) { + defer wg.Done() + + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err != nil { + t.Errorf("%d: expected TCP frame: %s", i, err) + } else if got, want := *got.Flags, header.TCPFlagRst|header.TCPFlagAck; got != want { + t.Errorf("%d: got %s, want %s", i, got, want) + } + }(i) + } + + for i := range connectingConns { + conn := &connectingConns[i] + + wg.Add(1) + go func(i int) { + defer wg.Done() + + if got, err := conn.Expect(t, testbench.TCP{}, time.Second); err == nil { + t.Errorf("%d: expected no TCP frame, got %s", i, got) + } + }(i) } + + wg.Wait() } diff --git a/test/runtimes/runner/lib/BUILD b/test/runtimes/runner/lib/BUILD index d308f41b0..3491c535b 100644 --- a/test/runtimes/runner/lib/BUILD +++ b/test/runtimes/runner/lib/BUILD @@ -5,7 +5,11 @@ package(licenses = ["notice"]) go_library( name = "lib", testonly = 1, - srcs = ["lib.go"], + srcs = [ + "go_test_dependency_go118.go", + "go_test_dependency_not_go118.go", + "lib.go", + ], visibility = ["//test/runtimes/runner:__pkg__"], deps = [ "//pkg/log", diff --git a/test/runtimes/runner/lib/go_test_dependency_go118.go b/test/runtimes/runner/lib/go_test_dependency_go118.go new file mode 100644 index 000000000..d430e81c7 --- /dev/null +++ b/test/runtimes/runner/lib/go_test_dependency_go118.go @@ -0,0 +1,27 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 && go1.1 +// +build go1.18,go1.1 + +package lib + +import ( + "testing" +) + +// mainStart wraps testing.MainStart for Go release == 1.18. +func mainStart(tests []testing.InternalTest) *testing.M { + return testing.MainStart(testDeps{}, tests, nil, nil, nil) +} diff --git a/test/runtimes/runner/lib/go_test_dependency_not_go118.go b/test/runtimes/runner/lib/go_test_dependency_not_go118.go new file mode 100644 index 000000000..8b0b34c72 --- /dev/null +++ b/test/runtimes/runner/lib/go_test_dependency_not_go118.go @@ -0,0 +1,25 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !go1.18 && go1.1 +// +build !go1.18,go1.1 + +package lib + +import "testing" + +// mainStart wraps testing.MainStart for Go release < 1.18. +func mainStart(tests []testing.InternalTest) *testing.M { + return testing.MainStart(testDeps{}, tests, nil, nil) +} diff --git a/test/runtimes/runner/lib/lib.go b/test/runtimes/runner/lib/lib.go index d6b652897..d704f8895 100644 --- a/test/runtimes/runner/lib/lib.go +++ b/test/runtimes/runner/lib/lib.go @@ -21,6 +21,7 @@ import ( "fmt" "io" "os" + "reflect" "sort" "strings" "testing" @@ -63,8 +64,7 @@ func RunTests(lang, image, excludeFile string, batchSize int, timeout time.Durat fmt.Fprintf(os.Stderr, "%s\n", err.Error()) return 1 } - - m := testing.MainStart(testDeps{}, tests, nil, nil) + m := mainStart(tests) return m.Run() } @@ -197,3 +197,21 @@ func (f testDeps) ImportPath() string { return "" } func (f testDeps) StartTestLog(io.Writer) {} func (f testDeps) StopTestLog() error { return nil } func (f testDeps) SetPanicOnExit0(bool) {} +func (f testDeps) CoordinateFuzzing(time.Duration, int64, time.Duration, int64, int, []corpusEntry, []reflect.Type, string, string) error { + return nil +} +func (f testDeps) RunFuzzWorker(func(corpusEntry) error) error { return nil } +func (f testDeps) ReadCorpus(string, []reflect.Type) ([]corpusEntry, error) { return nil, nil } +func (f testDeps) CheckCorpus([]interface{}, []reflect.Type) error { return nil } +func (f testDeps) ResetCoverage() {} +func (f testDeps) SnapshotCoverage() {} + +// Copied from testing/fuzz.go. +type corpusEntry = struct { + Parent string + Name string + Data []byte + Values []interface{} + Generation int + IsSeed bool +} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 5b882875f..b96005d7d 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -590,6 +590,7 @@ cc_binary( "//test/util:posix_error", "//test/util:test_main", "//test/util:test_util", + "//test/util:thread_util", ], ) @@ -1978,6 +1979,7 @@ cc_binary( defines = select_system(), linkstatic = 1, deps = [ + ":ip_socket_test_util", ":unix_domain_socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", @@ -2660,16 +2662,11 @@ cc_library( cc_library( name = "socket_ip_udp_unbound_external_networking", testonly = 1, - srcs = [ - "socket_ip_udp_unbound_external_networking.cc", - ], hdrs = [ "socket_ip_udp_unbound_external_networking.h", ], deps = [ ":ip_socket_test_util", - "//test/util:socket_util", - "//test/util:test_util", ], alwayslink = 1, ) @@ -2685,6 +2682,9 @@ cc_library( ], deps = [ ":socket_ip_udp_unbound_external_networking", + "//test/util:socket_util", + "//test/util:test_util", + "@com_google_absl//absl/cleanup", gtest, ], alwayslink = 1, diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index 3ef8b0327..c2dc8174c 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -30,6 +30,7 @@ #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" #include "test/util/test_util.h" +#include "test/util/thread_util.h" namespace gvisor { namespace testing { @@ -496,6 +497,41 @@ TEST(EpollTest, PipeReaderHupAfterWriterClosed) { EXPECT_EQ(result[0].data.u64, kMagicConstant); } +TEST(EpollTest, DoubleLayerEpoll) { + int pipefds[2]; + ASSERT_THAT(pipe2(pipefds, O_NONBLOCK), SyscallSucceeds()); + FileDescriptor rfd(pipefds[0]); + FileDescriptor wfd(pipefds[1]); + + auto epfd1 = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + ASSERT_NO_ERRNO( + RegisterEpollFD(epfd1.get(), rfd.get(), EPOLLIN | EPOLLHUP, rfd.get())); + + auto epfd2 = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + ASSERT_NO_ERRNO(RegisterEpollFD(epfd2.get(), epfd1.get(), EPOLLIN | EPOLLHUP, + epfd1.get())); + + // Write to wfd and then check if epoll events were generated correctly. + // Run this loop a couple of times to check if event in epfd1 is cleaned. + constexpr char data[] = "data"; + for (int i = 0; i < 2; ++i) { + ScopedThread thread1([&wfd, &data]() { + sleep(1); + ASSERT_EQ(WriteFd(wfd.get(), data, sizeof(data)), sizeof(data)); + }); + + struct epoll_event ret_events[2]; + ASSERT_THAT(RetryEINTR(epoll_wait)(epfd2.get(), ret_events, 2, 5000), + SyscallSucceedsWithValue(1)); + ASSERT_EQ(ret_events[0].data.fd, epfd1.get()); + ASSERT_THAT(RetryEINTR(epoll_wait)(epfd1.get(), ret_events, 2, 5000), + SyscallSucceedsWithValue(1)); + ASSERT_EQ(ret_events[0].data.fd, rfd.get()); + char readBuf[sizeof(data)]; + ASSERT_EQ(ReadFd(rfd.get(), readBuf, sizeof(data)), sizeof(data)); + } +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index a1216d23f..d18e616d0 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -16,6 +16,7 @@ #include <net/if.h> #include <netinet/in.h> +#include <netpacket/packet.h> #include <sys/socket.h> #include <cstring> @@ -196,75 +197,53 @@ SocketKind IPv6TCPUnboundSocket(int type) { UnboundSocketCreator(AF_INET6, type | SOCK_STREAM, IPPROTO_TCP)}; } -PosixError IfAddrHelper::Load() { - Release(); -#ifndef ANDROID - RETURN_ERROR_IF_SYSCALL_FAIL(getifaddrs(&ifaddr_)); -#else - // Android does not support getifaddrs in r22. - return PosixError(ENOSYS, "getifaddrs"); -#endif - return NoError(); -} - -void IfAddrHelper::Release() { - if (ifaddr_) { -#ifndef ANDROID - // Android does not support freeifaddrs in r22. - freeifaddrs(ifaddr_); -#endif - ifaddr_ = nullptr; - } -} - -std::vector<std::string> IfAddrHelper::InterfaceList(int family) const { - std::vector<std::string> names; - for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) { - if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) { - continue; - } - names.emplace(names.end(), ifa->ifa_name); - } - return names; -} - -const sockaddr* IfAddrHelper::GetAddr(int family, std::string name) const { - for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) { - if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) { - continue; - } - if (name == ifa->ifa_name) { - return ifa->ifa_addr; - } - } - return nullptr; -} - -PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) const { - return InterfaceIndex(name); -} - std::string GetAddr4Str(const in_addr* a) { char str[INET_ADDRSTRLEN]; - inet_ntop(AF_INET, a, str, sizeof(str)); - return std::string(str); + return inet_ntop(AF_INET, a, str, sizeof(str)); } std::string GetAddr6Str(const in6_addr* a) { char str[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, a, str, sizeof(str)); - return std::string(str); + return inet_ntop(AF_INET6, a, str, sizeof(str)); } std::string GetAddrStr(const sockaddr* a) { - if (a->sa_family == AF_INET) { - auto src = &(reinterpret_cast<const sockaddr_in*>(a)->sin_addr); - return GetAddr4Str(src); - } else if (a->sa_family == AF_INET6) { - auto src = &(reinterpret_cast<const sockaddr_in6*>(a)->sin6_addr); - return GetAddr6Str(src); + switch (a->sa_family) { + case AF_INET: { + return GetAddr4Str(&(reinterpret_cast<const sockaddr_in*>(a)->sin_addr)); + } + case AF_INET6: { + return GetAddr6Str( + &(reinterpret_cast<const sockaddr_in6*>(a)->sin6_addr)); + } + case AF_PACKET: { + const sockaddr_ll& ll = *reinterpret_cast<const sockaddr_ll*>(a); + std::ostringstream ss; + ss << std::hex; + ss << std::showbase; + ss << '{'; + ss << " protocol=" << ntohs(ll.sll_protocol); + ss << " ifindex=" << ll.sll_ifindex; + ss << " hatype=" << ll.sll_hatype; + ss << " pkttype=" << static_cast<unsigned short>(ll.sll_pkttype); + if (ll.sll_halen != 0) { + ss << " addr="; + for (unsigned char i = 0; i < ll.sll_halen; ++i) { + if (i != 0) { + ss << ':'; + } + ss << static_cast<unsigned short>(ll.sll_addr[i]); + } + } + ss << " }"; + return ss.str(); + } + default: { + std::ostringstream ss; + ss << "invalid(sa_family=" << a->sa_family << ")"; + return ss.str(); + } } - return std::string("<invalid>"); } } // namespace testing diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index 556838356..957006e25 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -115,25 +115,6 @@ SocketKind IPv4TCPUnboundSocket(int type); // created with AF_INET6, SOCK_STREAM, IPPROTO_TCP and the given type. SocketKind IPv6TCPUnboundSocket(int type); -// IfAddrHelper is a helper class that determines the local interfaces present -// and provides functions to obtain their names, index numbers, and IP address. -class IfAddrHelper { - public: - IfAddrHelper() : ifaddr_(nullptr) {} - ~IfAddrHelper() { Release(); } - - PosixError Load(); - void Release(); - - std::vector<std::string> InterfaceList(int family) const; - - const sockaddr* GetAddr(int family, std::string name) const; - PosixErrorOr<int> GetIndex(std::string name) const; - - private: - struct ifaddrs* ifaddr_; -}; - // GetAddr4Str returns the given IPv4 network address structure as a string. std::string GetAddr4Str(const in_addr* a); diff --git a/test/syscalls/linux/proc_isolated.cc b/test/syscalls/linux/proc_isolated.cc index a38689667..38d079d2b 100644 --- a/test/syscalls/linux/proc_isolated.cc +++ b/test/syscalls/linux/proc_isolated.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include <linux/msg.h> #include <linux/sem.h> #include <linux/shm.h> @@ -73,6 +74,27 @@ TEST(ProcDefaults, PresenceOfSem) { ASSERT_EQ(semmni, SEMMNI); } +TEST(ProcDefaults, PresenceOfMsgMniMaxMnb) { + uint64_t msgmni = 0; + uint64_t msgmax = 0; + uint64_t msgmnb = 0; + + std::string proc_file; + proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/msgmni")); + ASSERT_FALSE(proc_file.empty()); + ASSERT_TRUE(absl::SimpleAtoi(proc_file, &msgmni)); + proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/msgmax")); + ASSERT_FALSE(proc_file.empty()); + ASSERT_TRUE(absl::SimpleAtoi(proc_file, &msgmax)); + proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/msgmnb")); + ASSERT_FALSE(proc_file.empty()); + ASSERT_TRUE(absl::SimpleAtoi(proc_file, &msgmnb)); + + ASSERT_EQ(msgmni, MSGMNI); + ASSERT_EQ(msgmax, MSGMAX); + ASSERT_EQ(msgmnb, MSGMNB); +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc index ef1db47ee..ef176cbee 100644 --- a/test/syscalls/linux/raw_socket.cc +++ b/test/syscalls/linux/raw_socket.cc @@ -25,6 +25,7 @@ #include <algorithm> #include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" @@ -39,6 +40,9 @@ namespace testing { namespace { +using ::testing::IsNull; +using ::testing::NotNull; + // Fixture for tests parameterized by protocol. class RawSocketTest : public ::testing::TestWithParam<std::tuple<int, int>> { protected: @@ -1057,6 +1061,131 @@ TEST(RawSocketTest, BindReceive) { ASSERT_NO_FATAL_FAILURE(TestRawSocketMaybeBindReceive(true /* do_bind */)); } +TEST(RawSocketTest, ReceiveIPPacketInfo) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); + + FileDescriptor raw = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP)); + + const sockaddr_in addr_ = { + .sin_family = AF_INET, + .sin_addr = {.s_addr = htonl(INADDR_LOOPBACK)}, + }; + ASSERT_THAT( + bind(raw.get(), reinterpret_cast<const sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Register to receive IP packet info. + constexpr int one = 1; + ASSERT_THAT(setsockopt(raw.get(), IPPROTO_IP, IP_PKTINFO, &one, sizeof(one)), + SyscallSucceeds()); + + constexpr char send_buf[] = "malformed UDP"; + ASSERT_THAT(sendto(raw.get(), send_buf, sizeof(send_buf), 0 /* flags */, + reinterpret_cast<const sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + struct { + iphdr ip; + char data[sizeof(send_buf)]; + + // Extra space in the receive buffer should be unused. + char unused_space; + } ABSL_ATTRIBUTE_PACKED recv_buf; + iovec recv_iov = { + .iov_base = &recv_buf, + .iov_len = sizeof(recv_buf), + }; + in_pktinfo received_pktinfo; + char recv_cmsg_buf[CMSG_SPACE(sizeof(received_pktinfo))]; + msghdr recv_msg = { + .msg_iov = &recv_iov, + .msg_iovlen = 1, + .msg_control = recv_cmsg_buf, + .msg_controllen = CMSG_LEN(sizeof(received_pktinfo)), + }; + ASSERT_THAT(RetryEINTR(recvmsg)(raw.get(), &recv_msg, 0), + SyscallSucceedsWithValue(sizeof(iphdr) + sizeof(send_buf))); + EXPECT_EQ(memcmp(send_buf, &recv_buf.data, sizeof(send_buf)), 0); + EXPECT_EQ(recv_buf.ip.version, static_cast<unsigned int>(IPVERSION)); + // IHL holds the number of header bytes in 4 byte units. + EXPECT_EQ(recv_buf.ip.ihl, sizeof(iphdr) / 4); + EXPECT_EQ(ntohs(recv_buf.ip.tot_len), sizeof(iphdr) + sizeof(send_buf)); + EXPECT_EQ(recv_buf.ip.protocol, IPPROTO_UDP); + EXPECT_EQ(ntohl(recv_buf.ip.saddr), INADDR_LOOPBACK); + EXPECT_EQ(ntohl(recv_buf.ip.daddr), INADDR_LOOPBACK); + + cmsghdr* cmsg = CMSG_FIRSTHDR(&recv_msg); + ASSERT_THAT(cmsg, NotNull()); + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(received_pktinfo))); + EXPECT_EQ(cmsg->cmsg_level, IPPROTO_IP); + EXPECT_EQ(cmsg->cmsg_type, IP_PKTINFO); + memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(received_pktinfo)); + EXPECT_EQ(received_pktinfo.ipi_ifindex, + ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex())); + EXPECT_EQ(ntohl(received_pktinfo.ipi_spec_dst.s_addr), INADDR_LOOPBACK); + EXPECT_EQ(ntohl(received_pktinfo.ipi_addr.s_addr), INADDR_LOOPBACK); + + EXPECT_THAT(CMSG_NXTHDR(&recv_msg, cmsg), IsNull()); +} + +TEST(RawSocketTest, ReceiveIPv6PacketInfo) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveRawIPSocketCapability())); + + FileDescriptor raw = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_RAW, IPPROTO_UDP)); + + const sockaddr_in6 addr_ = { + .sin6_family = AF_INET6, + .sin6_addr = in6addr_loopback, + }; + ASSERT_THAT( + bind(raw.get(), reinterpret_cast<const sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Register to receive IPv6 packet info. + constexpr int one = 1; + ASSERT_THAT( + setsockopt(raw.get(), IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one)), + SyscallSucceeds()); + + constexpr char send_buf[] = "malformed UDP"; + ASSERT_THAT(sendto(raw.get(), send_buf, sizeof(send_buf), 0 /* flags */, + reinterpret_cast<const sockaddr*>(&addr_), sizeof(addr_)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + char recv_buf[sizeof(send_buf) + 1]; + iovec recv_iov = { + .iov_base = recv_buf, + .iov_len = sizeof(recv_buf), + }; + in6_pktinfo received_pktinfo; + char recv_cmsg_buf[CMSG_SPACE(sizeof(received_pktinfo))]; + msghdr recv_msg = { + .msg_iov = &recv_iov, + .msg_iovlen = 1, + .msg_control = recv_cmsg_buf, + .msg_controllen = CMSG_LEN(sizeof(received_pktinfo)), + }; + ASSERT_THAT(RetryEINTR(recvmsg)(raw.get(), &recv_msg, 0), + SyscallSucceedsWithValue(sizeof(send_buf))); + EXPECT_EQ(memcmp(send_buf, recv_buf, sizeof(send_buf)), 0); + + cmsghdr* cmsg = CMSG_FIRSTHDR(&recv_msg); + ASSERT_THAT(cmsg, NotNull()); + EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(received_pktinfo))); + EXPECT_EQ(cmsg->cmsg_level, IPPROTO_IPV6); + EXPECT_EQ(cmsg->cmsg_type, IPV6_PKTINFO); + memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(received_pktinfo)); + EXPECT_EQ(received_pktinfo.ipi6_ifindex, + ASSERT_NO_ERRNO_AND_VALUE(GetLoopbackIndex())); + ASSERT_EQ(memcmp(&received_pktinfo.ipi6_addr, &in6addr_loopback, + sizeof(in6addr_loopback)), + 0); + + EXPECT_THAT(CMSG_NXTHDR(&recv_msg, cmsg), IsNull()); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ip_udp_unbound_external_networking.cc deleted file mode 100644 index af2459a2f..000000000 --- a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "test/syscalls/linux/socket_ip_udp_unbound_external_networking.h" - -#include "test/util/socket_util.h" -#include "test/util/test_util.h" - -namespace gvisor { -namespace testing { - -void IPUDPUnboundExternalNetworkingSocketTest::SetUp() { - // FIXME(b/137899561): Linux instance for syscall tests sometimes misses its - // IPv4 address on eth0. - found_net_interfaces_ = false; - - // Get interface list. - ASSERT_NO_ERRNO(if_helper_.Load()); - std::vector<std::string> if_names = if_helper_.InterfaceList(AF_INET); - if (if_names.size() != 2) { - return; - } - - // Figure out which interface is where. - std::string lo = if_names[0]; - std::string eth = if_names[1]; - if (lo != "lo") std::swap(lo, eth); - if (lo != "lo") return; - - lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(lo)); - auto lo_if_addr = if_helper_.GetAddr(AF_INET, lo); - if (lo_if_addr == nullptr) { - return; - } - lo_if_addr_ = *reinterpret_cast<const sockaddr_in*>(lo_if_addr); - - eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(eth)); - auto eth_if_addr = if_helper_.GetAddr(AF_INET, eth); - if (eth_if_addr == nullptr) { - return; - } - eth_if_addr_ = *reinterpret_cast<const sockaddr_in*>(eth_if_addr); - - found_net_interfaces_ = true; -} - -} // namespace testing -} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h index 2e8aab129..92c20eba9 100644 --- a/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h +++ b/test/syscalls/linux/socket_ip_udp_unbound_external_networking.h @@ -16,29 +16,13 @@ #define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_UNBOUND_EXTERNAL_NETWORKING_H_ #include "test/syscalls/linux/ip_socket_test_util.h" -#include "test/util/socket_util.h" namespace gvisor { namespace testing { // Test fixture for tests that apply to unbound IP UDP sockets in a sandbox // with external networking support. -class IPUDPUnboundExternalNetworkingSocketTest : public SimpleSocketTest { - protected: - void SetUp() override; - - IfAddrHelper if_helper_; - - // found_net_interfaces_ is set to false if SetUp() could not obtain - // all interface infos that we need. - bool found_net_interfaces_; - - // Interface infos. - int lo_if_idx_; - int eth_if_idx_; - sockaddr_in lo_if_addr_; - sockaddr_in eth_if_addr_; -}; +class IPUDPUnboundExternalNetworkingSocketTest : public SimpleSocketTest {}; } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc index c6e775b2a..6c67ec51e 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -14,9 +14,62 @@ #include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h" +#include <net/if.h> + +#include "absl/cleanup/cleanup.h" +#include "test/util/socket_util.h" +#include "test/util/test_util.h" + namespace gvisor { namespace testing { +void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() { +#ifdef ANDROID + GTEST_SKIP() << "Android does not support getifaddrs in r22"; +#endif + + ifaddrs* ifaddr; + ASSERT_THAT(getifaddrs(&ifaddr), SyscallSucceeds()); + auto cleanup = absl::MakeCleanup([ifaddr] { freeifaddrs(ifaddr); }); + + for (const ifaddrs* ifa = ifaddr; ifa != nullptr; ifa = ifa->ifa_next) { + ASSERT_NE(ifa->ifa_name, nullptr); + ASSERT_NE(ifa->ifa_addr, nullptr); + + if (ifa->ifa_addr->sa_family != AF_INET) { + continue; + } + + std::optional<std::pair<int, sockaddr_in>>& if_pair = *[this, ifa]() { + if (strcmp(ifa->ifa_name, "lo") == 0) { + return &lo_if_; + } + return ð_if_; + }(); + + const int if_index = + ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex(ifa->ifa_name)); + + std::cout << " name=" << ifa->ifa_name + << " addr=" << GetAddrStr(ifa->ifa_addr) << " index=" << if_index + << " has_value=" << if_pair.has_value() << std::endl; + + if (if_pair.has_value()) { + continue; + } + + if_pair = std::make_pair( + if_index, *reinterpret_cast<const sockaddr_in*>(ifa->ifa_addr)); + } + + if (!(eth_if_.has_value() && lo_if_.has_value())) { + // FIXME(b/137899561): Linux instance for syscall tests sometimes misses its + // IPv4 address on eth0. + GTEST_SKIP() << " eth_if_.has_value()=" << eth_if_.has_value() + << " lo_if_.has_value()=" << lo_if_.has_value(); + } +} + TestAddress V4EmptyAddress() { TestAddress t("V4Empty"); t.addr.ss_family = AF_INET; @@ -28,7 +81,6 @@ TestAddress V4EmptyAddress() { // the destination port number. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, UDPBroadcastReceivedOnExpectedPort) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto rcvr1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto rcvr2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -101,8 +153,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // not a unicast address. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, UDPBroadcastReceivedOnExpectedAddresses) { - SKIP_IF(!found_net_interfaces_); - auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto rcvr1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto rcvr2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -149,7 +199,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Bind the non-receiving socket to the unicast ethernet address. auto norecv_addr = rcv1_addr; reinterpret_cast<sockaddr_in*>(&norecv_addr.addr)->sin_addr = - eth_if_addr_.sin_addr; + eth_if_addr().sin_addr; ASSERT_THAT( bind(norcv->get(), AsSockAddr(&norecv_addr.addr), norecv_addr.addr_len), SyscallSucceedsWithValue(0)); @@ -184,7 +234,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // (UDPBroadcastSendRecvOnSocketBoundToAny). TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, UDPBroadcastSendRecvOnSocketBoundToBroadcast) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); // Enable SO_BROADCAST. @@ -224,7 +273,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // (UDPBroadcastSendRecvOnSocketBoundToBroadcast). TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, UDPBroadcastSendRecvOnSocketBoundToAny) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); // Enable SO_BROADCAST. @@ -261,7 +309,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Verifies that a UDP broadcast fails to send on a socket with SO_BROADCAST // disabled. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendBroadcast) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); // Broadcast a test message without having enabled SO_BROADCAST on the sending @@ -306,12 +353,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendUnicastOnUnbound) { // set interface or group membership. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelfNoGroup) { - // FIXME(b/125485338): A group membership is not required for external - // multicast on gVisor. - SKIP_IF(IsRunningOnGvisor()); - - SKIP_IF(!found_net_interfaces_); - auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto bind_addr = V4Any(); @@ -345,7 +386,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Check that multicast packets will be delivered to the sending socket without // setting an interface. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelf) { - SKIP_IF(!found_net_interfaces_); auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto bind_addr = V4Any(); @@ -388,7 +428,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelf) { // set interface and IP_MULTICAST_LOOP disabled. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelfLoopOff) { - SKIP_IF(!found_net_interfaces_); auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto bind_addr = V4Any(); @@ -434,12 +473,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Check that multicast packets won't be delivered to another socket with no // set interface or group membership. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastNoGroup) { - // FIXME(b/125485338): A group membership is not required for external - // multicast on gVisor. - SKIP_IF(IsRunningOnGvisor()); - - SKIP_IF(!found_net_interfaces_); - auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -476,7 +509,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastNoGroup) { // Check that multicast packets will be delivered to another socket without // setting an interface. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticast) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -522,7 +554,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticast) { // set interface and IP_MULTICAST_LOOP disabled on the sending socket. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSenderNoLoop) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -572,8 +603,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // setting an interface and IP_MULTICAST_LOOP disabled on the receiving socket. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastReceiverNoLoop) { - SKIP_IF(!found_net_interfaces_); - auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -624,7 +653,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // and both will receive data on it when bound to the ANY address. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastToTwoBoundToAny) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); std::unique_ptr<FileDescriptor> receivers[2] = { ASSERT_NO_ERRNO_AND_VALUE(NewSocket()), @@ -689,7 +717,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // and both will receive data on it when bound to the multicast address. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastToTwoBoundToMulticastAddress) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); std::unique_ptr<FileDescriptor> receivers[2] = { ASSERT_NO_ERRNO_AND_VALUE(NewSocket()), @@ -757,7 +784,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // multicast address, both will receive data. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastToTwoBoundToAnyAndMulticastAddress) { - SKIP_IF(!found_net_interfaces_); auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); std::unique_ptr<FileDescriptor> receivers[2] = { ASSERT_NO_ERRNO_AND_VALUE(NewSocket()), @@ -829,8 +855,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // is not a multicast address. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, IpMulticastLoopbackFromAddr) { - SKIP_IF(!found_net_interfaces_); - auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); @@ -893,8 +917,6 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // interface, a multicast packet sent out uses the latter as its source address. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, IpMulticastLoopbackIfNicAndAddr) { - SKIP_IF(!found_net_interfaces_); - // Create receiver, bind to ANY and join the multicast group. auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto receiver_addr = V4Any(); @@ -906,11 +928,15 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, &receiver_addr_len), SyscallSucceeds()); EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); - int receiver_port = + const in_port_t receiver_port = reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port; - ip_mreqn group = {}; - group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); - group.imr_ifindex = lo_if_idx_; + const ip_mreqn group = { + .imr_multiaddr = + { + .s_addr = inet_addr(kMulticastAddress), + }, + .imr_ifindex = lo_if_idx(), + }; ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -918,9 +944,10 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Set outgoing multicast interface config, with NIC and addr pointing to // different interfaces. auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - ip_mreqn iface = {}; - iface.imr_ifindex = lo_if_idx_; - iface.imr_address = eth_if_addr_.sin_addr; + const ip_mreqn iface = { + .imr_address = eth_if_addr().sin_addr, + .imr_ifindex = lo_if_idx(), + }; ASSERT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); @@ -928,67 +955,104 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Send a multicast packet. auto sendto_addr = V4Multicast(); reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port = receiver_port; - char send_buf[4] = {}; + char send_buf[4]; ASSERT_THAT( RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, AsSockAddr(&sendto_addr.addr), sendto_addr.addr_len), SyscallSucceedsWithValue(sizeof(send_buf))); // Receive a multicast packet. - char recv_buf[sizeof(send_buf)] = {}; + char recv_buf[sizeof(send_buf) + 1]; auto src_addr = V4EmptyAddress(); ASSERT_THAT( RetryEINTR(recvfrom)(receiver->get(), recv_buf, sizeof(recv_buf), 0, AsSockAddr(&src_addr.addr), &src_addr.addr_len), - SyscallSucceedsWithValue(sizeof(recv_buf))); - ASSERT_EQ(sizeof(struct sockaddr_in), src_addr.addr_len); - sockaddr_in* src_addr_in = reinterpret_cast<sockaddr_in*>(&src_addr.addr); - - // FIXME (b/137781162): When sending a multicast packet use the proper logic - // to determine the packet's src-IP. - SKIP_IF(IsRunningOnGvisor()); + SyscallSucceedsWithValue(sizeof(send_buf))); + ASSERT_EQ(src_addr.addr_len, sizeof(struct sockaddr_in)); // Verify the received source address. - EXPECT_EQ(eth_if_addr_.sin_addr.s_addr, src_addr_in->sin_addr.s_addr); + // + // TODO(https://gvisor.dev/issue/6686): gVisor is a strong host, preventing + // the packet from being sent from the loopback device using the ethernet + // device's address. + if (IsRunningOnGvisor()) { + EXPECT_EQ(GetAddrStr(AsSockAddr(&src_addr.addr)), + GetAddr4Str(&lo_if_addr().sin_addr)); + } else { + EXPECT_EQ(GetAddrStr(AsSockAddr(&src_addr.addr)), + GetAddr4Str(ð_if_addr().sin_addr)); + } } // Check that when we are bound to one interface we can set IP_MULTICAST_IF to // another interface. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, IpMulticastLoopbackBindToOneIfSetMcastIfToAnother) { - SKIP_IF(!found_net_interfaces_); - - // FIXME (b/137790511): When bound to one interface it is not possible to set - // IP_MULTICAST_IF to a different interface. - SKIP_IF(IsRunningOnGvisor()); - - // Create sender and bind to eth interface. - auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); - ASSERT_THAT( - bind(sender->get(), AsSockAddr(ð_if_addr_), sizeof(eth_if_addr_)), - SyscallSucceeds()); - // Run through all possible combinations of index and address for // IP_MULTICAST_IF that selects the loopback interface. - struct { - int imr_ifindex; - struct in_addr imr_address; - } test_data[] = { - {lo_if_idx_, {}}, - {0, lo_if_addr_.sin_addr}, - {lo_if_idx_, lo_if_addr_.sin_addr}, - {lo_if_idx_, eth_if_addr_.sin_addr}, + ip_mreqn ifaces[] = { + { + .imr_address = {}, + .imr_ifindex = lo_if_idx(), + }, + { + .imr_address = lo_if_addr().sin_addr, + .imr_ifindex = 0, + }, + { + .imr_address = lo_if_addr().sin_addr, + .imr_ifindex = lo_if_idx(), + }, + { + .imr_address = eth_if_addr().sin_addr, + .imr_ifindex = lo_if_idx(), + }, }; - for (auto t : test_data) { - ip_mreqn iface = {}; - iface.imr_ifindex = t.imr_ifindex; - iface.imr_address = t.imr_address; - EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, - sizeof(iface)), - SyscallSucceeds()) - << "imr_index=" << iface.imr_ifindex - << " imr_address=" << GetAddr4Str(&iface.imr_address); + + { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + ASSERT_THAT( + bind(sender->get(), AsSockAddr(ð_if_addr()), sizeof(eth_if_addr())), + SyscallSucceeds()); + + for (const ip_mreqn& iface : ifaces) { + EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, + sizeof(iface)), + SyscallSucceeds()) + << " imr_index=" << iface.imr_ifindex + << " imr_address=" << GetAddr4Str(&iface.imr_address); + } + } + + { + char eth_if_name[IF_NAMESIZE]; + memset(eth_if_name, 0xAA, sizeof(eth_if_name)); + ASSERT_NE(if_indextoname(eth_if_idx(), eth_if_name), nullptr) + << strerror(errno); + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + ASSERT_THAT(setsockopt(sender->get(), SOL_SOCKET, SO_BINDTODEVICE, + eth_if_name, sizeof(eth_if_name)), + SyscallSucceeds()); + + for (const ip_mreqn& iface : ifaces) { + // FIXME(b/137790511): Disallow mismatching IP_MULTICAST_IF and + // SO_BINDTODEVICE. + if (IsRunningOnGvisor()) { + EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()) + << " imr_index=" << iface.imr_ifindex + << " imr_address=" << GetAddr4Str(&iface.imr_address); + } else { + EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallFailsWithErrno(EINVAL)) + << " imr_index=" << iface.imr_ifindex + << " imr_address=" << GetAddr4Str(&iface.imr_address); + } + } } } + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h index 20922ac1f..ac917b32c 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h @@ -22,8 +22,22 @@ namespace testing { // Test fixture for tests that apply to unbound IPv4 UDP sockets in a sandbox // with external networking support. -using IPv4UDPUnboundExternalNetworkingSocketTest = - IPUDPUnboundExternalNetworkingSocketTest; +class IPv4UDPUnboundExternalNetworkingSocketTest + : public IPUDPUnboundExternalNetworkingSocketTest { + protected: + void SetUp() override; + + int lo_if_idx() const { return std::get<0>(lo_if_.value()); } + int eth_if_idx() const { return std::get<0>(eth_if_.value()); } + + const sockaddr_in& lo_if_addr() const { return std::get<1>(lo_if_.value()); } + const sockaddr_in& eth_if_addr() const { + return std::get<1>(eth_if_.value()); + } + + private: + std::optional<std::pair<int, sockaddr_in>> lo_if_, eth_if_; +}; } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc index d72b6b53f..c6e563cd3 100644 --- a/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv6_udp_unbound_external_networking.cc @@ -18,8 +18,6 @@ namespace gvisor { namespace testing { TEST_P(IPv6UDPUnboundExternalNetworkingSocketTest, TestJoinLeaveMulticast) { - SKIP_IF(!found_net_interfaces_); - auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); diff --git a/test/util/socket_util.h b/test/util/socket_util.h index 0e2be63cc..588f041b7 100644 --- a/test/util/socket_util.h +++ b/test/util/socket_util.h @@ -554,15 +554,27 @@ uint16_t ICMPChecksum(struct icmphdr icmphdr, const char* payload, inline sockaddr* AsSockAddr(sockaddr_storage* s) { return reinterpret_cast<sockaddr*>(s); } +inline const sockaddr* AsSockAddr(const sockaddr_storage* s) { + return reinterpret_cast<const sockaddr*>(s); +} inline sockaddr* AsSockAddr(sockaddr_in* s) { return reinterpret_cast<sockaddr*>(s); } +inline const sockaddr* AsSockAddr(const sockaddr_in* s) { + return reinterpret_cast<const sockaddr*>(s); +} inline sockaddr* AsSockAddr(sockaddr_in6* s) { return reinterpret_cast<sockaddr*>(s); } +inline const sockaddr* AsSockAddr(const sockaddr_in6* s) { + return reinterpret_cast<const sockaddr*>(s); +} inline sockaddr* AsSockAddr(sockaddr_un* s) { return reinterpret_cast<sockaddr*>(s); } +inline const sockaddr* AsSockAddr(const sockaddr_un* s) { + return reinterpret_cast<const sockaddr*>(s); +} PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr); diff --git a/tools/bazel.mk b/tools/bazel.mk index 1444423e4..68b804ec4 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -186,8 +186,8 @@ build_paths = \ (set -euo pipefail; \ $(call wrapper,$(BAZEL) build $(BASE_OPTIONS) $(BAZEL_OPTIONS) $(1)) && \ $(call wrapper,$(BAZEL) cquery $(BASE_OPTIONS) $(BAZEL_OPTIONS) $(1) --output=starlark --starlark:file=tools/show_paths.bzl) \ - | xargs -r -n 1 -I {} bash -c 'test -e "{}" || exit 0; readlink -f "{}"' \ - | xargs -r -n 1 -I {} bash -c 'set -euo pipefail; $(2)') + | xargs -r -I {} bash -c 'test -e "{}" || exit 0; readlink -f "{}"' \ + | xargs -r -I {} bash -c 'set -euo pipefail; $(2)') clean = $(call header,CLEAN) && $(call wrapper,$(BAZEL) clean) build = $(call header,BUILD $(1)) && $(call build_paths,$(1),echo {}) diff --git a/tools/nogo/nogo.go b/tools/nogo/nogo.go index d95d7652f..2f88f84db 100644 --- a/tools/nogo/nogo.go +++ b/tools/nogo/nogo.go @@ -293,6 +293,19 @@ func CheckStdlib(config *StdlibConfig, analyzers []*analysis.Analyzer) (allFindi break } + // Go standard library packages using Go 1.18 type parameter features. + // + // As of writing, analysis tooling is not updated to support type + // parameters and will choke on these packages. We skip these packages + // entirely for now. + // + // TODO(b/201686256): remove once tooling can handle type parameters. + usesTypeParams := map[string]struct{}{ + "constraints": struct{}{}, // golang.org/issue/45458 + "maps": struct{}{}, // golang.org/issue/47649 + "slices": struct{}{}, // golang.org/issue/45955 + } + // Aggregate all files by directory. packages := make(map[string]*PackageConfig) for _, file := range config.Srcs { @@ -306,10 +319,17 @@ func CheckStdlib(config *StdlibConfig, analyzers []*analysis.Analyzer) (allFindi continue // Not a file. } pkg := d[len(rootSrcPrefix):] + // Skip cmd packages and obvious test files: see above. if strings.HasPrefix(pkg, "cmd/") || strings.HasSuffix(file, "_test.go") { continue } + + if _, ok := usesTypeParams[pkg]; ok { + log.Printf("WARNING: Skipping package %q: type param analysis not yet supported", pkg) + continue + } + c, ok := packages[pkg] if !ok { c = &PackageConfig{ |