diff options
author | Ian Lewis <ianmlewis@gmail.com> | 2020-08-17 21:44:31 -0400 |
---|---|---|
committer | Ian Lewis <ianmlewis@gmail.com> | 2020-08-17 21:44:31 -0400 |
commit | ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch) | |
tree | 0cbc5018e8807421d701d190dc20525726c7ca76 /test/benchmarks/tcp | |
parent | 352ae1022ce19de28fc72e034cc469872ad79d06 (diff) | |
parent | 6d0c5803d557d453f15ac6f683697eeb46dab680 (diff) |
Merge branch 'master' into ip-forwarding
- Merges aleksej-paschenko's with HEAD
- Adds vfs2 support for ip_forward
Diffstat (limited to 'test/benchmarks/tcp')
-rw-r--r-- | test/benchmarks/tcp/BUILD | 41 | ||||
-rw-r--r-- | test/benchmarks/tcp/README.md | 87 | ||||
-rw-r--r-- | test/benchmarks/tcp/nsjoin.c | 47 | ||||
-rwxr-xr-x | test/benchmarks/tcp/tcp_benchmark.sh | 392 | ||||
-rw-r--r-- | test/benchmarks/tcp/tcp_proxy.go | 451 |
5 files changed, 1018 insertions, 0 deletions
diff --git a/test/benchmarks/tcp/BUILD b/test/benchmarks/tcp/BUILD new file mode 100644 index 000000000..6dde7d9e6 --- /dev/null +++ b/test/benchmarks/tcp/BUILD @@ -0,0 +1,41 @@ +load("//tools:defs.bzl", "cc_binary", "go_binary") + +package(licenses = ["notice"]) + +go_binary( + name = "tcp_proxy", + srcs = ["tcp_proxy.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/adapters/gonet", + "//pkg/tcpip/link/fdbased", + "//pkg/tcpip/link/qdisc/fifo", + "//pkg/tcpip/network/arp", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +# nsjoin is a trivial replacement for nsenter. This is used because nsenter is +# not available on all systems where this benchmark is run (and we aim to +# minimize external dependencies.) + +cc_binary( + name = "nsjoin", + srcs = ["nsjoin.c"], + visibility = ["//:sandbox"], +) + +sh_binary( + name = "tcp_benchmark", + srcs = ["tcp_benchmark.sh"], + data = [ + ":nsjoin", + ":tcp_proxy", + ], + visibility = ["//:sandbox"], +) diff --git a/test/benchmarks/tcp/README.md b/test/benchmarks/tcp/README.md new file mode 100644 index 000000000..38e6e69f0 --- /dev/null +++ b/test/benchmarks/tcp/README.md @@ -0,0 +1,87 @@ +# TCP Benchmarks + +This directory contains a standardized TCP benchmark. This helps to evaluate the +performance of netstack and native networking stacks under various conditions. + +## `tcp_benchmark` + +This benchmark allows TCP throughput testing under various conditions. The setup +consists of an iperf client, a client proxy, a server proxy and an iperf server. +The client proxy and server proxy abstract the network mechanism used to +communicate between the iperf client and server. + +The setup looks like the following: + +``` + +--------------+ (native) +--------------+ + | iperf client |[lo @ 10.0.0.1]------>| client proxy | + +--------------+ +--------------+ + [client.0 @ 10.0.0.2] + (netstack) | | (native) + +------+-----+ + | + [br0] + | + Network emulation applied ---> [wan.0:wan.1] + | + [br1] + | + +------+-----+ + (netstack) | | (native) + [server.0 @ 10.0.0.3] + +--------------+ +--------------+ + | iperf server |<------[lo @ 10.0.0.4]| server proxy | + +--------------+ (native) +--------------+ +``` + +Different configurations can be run using different arguments. For example: + +* Native test under normal internet conditions: `tcp_benchmark` +* Native test under ideal conditions: `tcp_benchmark --ideal` +* Netstack client under ideal conditions: `tcp_benchmark --client --ideal` +* Netstack client with 5% packet loss: `tcp_benchmark --client --ideal --loss + 5` + +Use `tcp_benchmark --help` for full arguments. + +This tool may be used to easily generate data for graphing. For example, to +generate a CSV for various latencies, you might do: + +``` +rm -f /tmp/netstack_latency.csv /tmp/native_latency.csv +latencies=$(seq 0 5 50; + seq 60 10 100; + seq 125 25 250; + seq 300 50 500) +for latency in $latencies; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --client --ideal --latency $latency) + echo $latency,$throughput,$client_cpu >> /tmp/netstack_latency.csv +done +for latency in $latencies; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --ideal --latency $latency) + echo $latency,$throughput,$client_cpu >> /tmp/native_latency.csv +done +``` + +Similarly, to generate a CSV for various levels of packet loss, the following +would be appropriate: + +``` +rm -f /tmp/netstack_loss.csv /tmp/native_loss.csv +losses=$(seq 0 0.1 1.0; + seq 1.2 0.2 2.0; + seq 2.5 0.5 5.0; + seq 6.0 1.0 10.0) +for loss in $losses; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --client --ideal --latency 10 --loss $loss) + echo $loss,$throughput,$client_cpu >> /tmp/netstack_loss.csv +done +for loss in $losses; do + read throughput client_cpu server_cpu <<< \ + $(./tcp_benchmark --duration 30 --ideal --latency 10 --loss $loss) + echo $loss,$throughput,$client_cpu >> /tmp/native_loss.csv +done +``` diff --git a/test/benchmarks/tcp/nsjoin.c b/test/benchmarks/tcp/nsjoin.c new file mode 100644 index 000000000..524b4d549 --- /dev/null +++ b/test/benchmarks/tcp/nsjoin.c @@ -0,0 +1,47 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +int main(int argc, char** argv) { + if (argc <= 2) { + fprintf(stderr, "error: must provide a namespace file.\n"); + fprintf(stderr, "usage: %s <file> [arguments...]\n", argv[0]); + return 1; + } + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "error opening %s: %s\n", argv[1], strerror(errno)); + return 1; + } + if (setns(fd, 0) < 0) { + fprintf(stderr, "error joining %s: %s\n", argv[1], strerror(errno)); + return 1; + } + + execvp(argv[2], &argv[2]); + return 1; +} diff --git a/test/benchmarks/tcp/tcp_benchmark.sh b/test/benchmarks/tcp/tcp_benchmark.sh new file mode 100755 index 000000000..ef04b4ace --- /dev/null +++ b/test/benchmarks/tcp/tcp_benchmark.sh @@ -0,0 +1,392 @@ +#!/bin/bash + +# Copyright 2018 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TCP benchmark; see README.md for documentation. + +# Fixed parameters. +iperf_port=45201 # Not likely to be privileged. +proxy_port=44000 # Ditto. +client_addr=10.0.0.1 +client_proxy_addr=10.0.0.2 +server_proxy_addr=10.0.0.3 +server_addr=10.0.0.4 +mask=8 + +# Defaults; this provides a reasonable approximation of a decent internet link. +# Parameters can be varied independently from this set to see response to +# various changes in the kind of link available. +client=false +server=false +verbose=false +gso=0 +swgso=false +mtu=1280 # 1280 is a reasonable lowest-common-denominator. +latency=10 # 10ms approximates a fast, dedicated connection. +latency_variation=1 # +/- 1ms is a relatively low amount of jitter. +loss=0.1 # 0.1% loss is non-zero, but not extremely high. +duplicate=0.1 # 0.1% means duplicates are 1/10x as frequent as losses. +duration=30 # 30s is enough time to consistent results (experimentally). +helper_dir=$(dirname $0) +netstack_opts= +disable_linux_gso= +num_client_threads=1 + +# Check for netem support. +lsmod_output=$(lsmod | grep sch_netem) +if [ "$?" != "0" ]; then + echo "warning: sch_netem may not be installed." >&2 +fi + +while [ $# -gt 0 ]; do + case "$1" in + --client) + client=true + ;; + --client_tcp_probe_file) + shift + netstack_opts="${netstack_opts} -client_tcp_probe_file=$1" + ;; + --server) + server=true + ;; + --verbose) + verbose=true + ;; + --gso) + shift + gso=$1 + ;; + --swgso) + swgso=true + ;; + --server_tcp_probe_file) + shift + netstack_opts="${netstack_opts} -server_tcp_probe_file=$1" + ;; + --ideal) + mtu=1500 # Standard ethernet. + latency=0 # No latency. + latency_variation=0 # No jitter. + loss=0 # No loss. + duplicate=0 # No duplicates. + ;; + --mtu) + shift + [ "$#" -le 0 ] && echo "no mtu provided" && exit 1 + mtu=$1 + ;; + --sack) + netstack_opts="${netstack_opts} -sack" + ;; + --cubic) + netstack_opts="${netstack_opts} -cubic" + ;; + --moderate-recv-buf) + netstack_opts="${netstack_opts} -moderate_recv_buf" + ;; + --duration) + shift + [ "$#" -le 0 ] && echo "no duration provided" && exit 1 + duration=$1 + ;; + --latency) + shift + [ "$#" -le 0 ] && echo "no latency provided" && exit 1 + latency=$1 + ;; + --latency-variation) + shift + [ "$#" -le 0 ] && echo "no latency variation provided" && exit 1 + latency_variation=$1 + ;; + --loss) + shift + [ "$#" -le 0 ] && echo "no loss probability provided" && exit 1 + loss=$1 + ;; + --duplicate) + shift + [ "$#" -le 0 ] && echo "no duplicate provided" && exit 1 + duplicate=$1 + ;; + --cpuprofile) + shift + netstack_opts="${netstack_opts} -cpuprofile=$1" + ;; + --memprofile) + shift + netstack_opts="${netstack_opts} -memprofile=$1" + ;; + --disable-linux-gso) + disable_linux_gso=1 + ;; + --num-client-threads) + shift + num_client_threads=$1 + ;; + --helpers) + shift + [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1 + helper_dir=$1 + ;; + *) + echo "usage: $0 [options]" + echo "options:" + echo " --help show this message" + echo " --verbose verbose output" + echo " --client use netstack as the client" + echo " --ideal reset all network emulation" + echo " --server use netstack as the server" + echo " --mtu set the mtu (bytes)" + echo " --sack enable SACK support" + echo " --moderate-recv-buf enable TCP receive buffer auto-tuning" + echo " --cubic enable CUBIC congestion control for Netstack" + echo " --duration set the test duration (s)" + echo " --latency set the latency (ms)" + echo " --latency-variation set the latency variation" + echo " --loss set the loss probability (%)" + echo " --duplicate set the duplicate probability (%)" + echo " --helpers set the helper directory" + echo " --num-client-threads number of parallel client threads to run" + echo " --disable-linux-gso disable segmentation offload in the Linux network stack" + echo "" + echo "The output will of the script will be:" + echo " <throughput> <client-cpu-usage> <server-cpu-usage>" + exit 1 + esac + shift +done + +if [ ${verbose} == "true" ]; then + set -x +fi + +# Latency needs to be halved, since it's applied on both ways. +half_latency=$(echo ${latency}/2 | bc -l | awk '{printf "%1.2f", $0}') +half_loss=$(echo ${loss}/2 | bc -l | awk '{printf "%1.6f", $0}') +half_duplicate=$(echo ${duplicate}/2 | bc -l | awk '{printf "%1.6f", $0}') +helper_dir=${helper_dir#$(pwd)/} # Use relative paths. +proxy_binary=${helper_dir}/tcp_proxy +nsjoin_binary=${helper_dir}/nsjoin + +if [ ! -e ${proxy_binary} ]; then + echo "Could not locate ${proxy_binary}, please make sure you've built the binary" + exit 1 +fi + +if [ ! -e ${nsjoin_binary} ]; then + echo "Could not locate ${nsjoin_binary}, please make sure you've built the binary" + exit 1 +fi + +if [ $(echo ${latency_variation} | awk '{printf "%1.2f", $0}') != "0.00" ]; then + # As long as there's some jitter, then we use the paretonormal distribution. + # This will preserve the minimum RTT, but add a realistic amount of jitter to + # the connection and cause re-ordering, etc. The regular pareto distribution + # appears to an unreasonable level of delay (we want only small spikes.) + distribution="distribution paretonormal" +else + distribution="" +fi + +# Client proxy that will listen on the client's iperf target forward traffic +# using the host networking stack. +client_args="${proxy_binary} -port ${proxy_port} -forward ${server_proxy_addr}:${proxy_port}" +if ${client}; then + # Client proxy that will listen on the client's iperf target + # and forward traffic using netstack. + client_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -client \\ + -mtu ${mtu} -iface client.0 -addr ${client_proxy_addr} -mask ${mask} \\ + -forward ${server_proxy_addr}:${proxy_port} -gso=${gso} -swgso=${swgso}" +fi + +# Server proxy that will listen on the proxy port and forward to the server's +# iperf server using the host networking stack. +server_args="${proxy_binary} -port ${proxy_port} -forward ${server_addr}:${iperf_port}" +if ${server}; then + # Server proxy that will listen on the proxy port and forward to the servers' + # iperf server using netstack. + server_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -server \\ + -mtu ${mtu} -iface server.0 -addr ${server_proxy_addr} -mask ${mask} \\ + -forward ${server_addr}:${iperf_port} -gso=${gso} -swgso=${swgso}" +fi + +# Specify loss and duplicate parameters only if they are non-zero +loss_opt="" +if [ "$(echo $half_loss | bc -q)" != "0" ]; then + loss_opt="loss random ${half_loss}%" +fi +duplicate_opt="" +if [ "$(echo $half_duplicate | bc -q)" != "0" ]; then + duplicate_opt="duplicate ${half_duplicate}%" +fi + +exec unshare -U -m -n -r -f -p --mount-proc /bin/bash << EOF +set -e -m + +if [ ${verbose} == "true" ]; then + set -x +fi + +mount -t tmpfs netstack-bench /tmp + +# We may have reset the path in the unshare if the shell loaded some public +# profiles. Ensure that tools are discoverable via the parent's PATH. +export PATH=${PATH} + +# Add client, server interfaces. +ip link add client.0 type veth peer name client.1 +ip link add server.0 type veth peer name server.1 + +# Add network emulation devices. +ip link add wan.0 type veth peer name wan.1 +ip link set wan.0 up +ip link set wan.1 up + +# Enroll on the bridge. +ip link add name br0 type bridge +ip link add name br1 type bridge +ip link set client.1 master br0 +ip link set server.1 master br1 +ip link set wan.0 master br0 +ip link set wan.1 master br1 +ip link set br0 up +ip link set br1 up + +# Set the MTU appropriately. +ip link set client.0 mtu ${mtu} +ip link set server.0 mtu ${mtu} +ip link set wan.0 mtu ${mtu} +ip link set wan.1 mtu ${mtu} + +# Add appropriate latency, loss and duplication. +# +# This is added in at the point of bridge connection. +for device in wan.0 wan.1; do + # NOTE: We don't support a loss correlation as testing has shown that it + # actually doesn't work. The man page actually has a small comment about this + # "It is also possible to add a correlation, but this option is now deprecated + # due to the noticed bad behavior." For more information see netem(8). + tc qdisc add dev \$device root netem \\ + delay ${half_latency}ms ${latency_variation}ms ${distribution} \\ + ${loss_opt} ${duplicate_opt} +done + +# Start a client proxy. +touch /tmp/client.netns +unshare -n mount --bind /proc/self/ns/net /tmp/client.netns + +# Move the endpoint into the namespace. +while ip link | grep client.0 > /dev/null; do + ip link set dev client.0 netns /tmp/client.netns +done + +if ! ${client}; then + # Only add the address to NIC if netstack is not in use. Otherwise the host + # will also process the inbound SYN and send a RST back. + ${nsjoin_binary} /tmp/client.netns ip addr add ${client_proxy_addr}/${mask} dev client.0 +fi + +# Start a server proxy. +touch /tmp/server.netns +unshare -n mount --bind /proc/self/ns/net /tmp/server.netns +# Move the endpoint into the namespace. +while ip link | grep server.0 > /dev/null; do + ip link set dev server.0 netns /tmp/server.netns +done +if ! ${server}; then + # Only add the address to NIC if netstack is not in use. Otherwise the host + # will also process the inbound SYN and send a RST back. + ${nsjoin_binary} /tmp/server.netns ip addr add ${server_proxy_addr}/${mask} dev server.0 +fi + +# Add client and server addresses, and bring everything up. +${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0 +${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0 +if [ "${disable_linux_gso}" == "1" ]; then + ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 tso off + ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gro off + ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gso off + ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 tso off + ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gso off + ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gro off +fi +${nsjoin_binary} /tmp/client.netns ip link set client.0 up +${nsjoin_binary} /tmp/client.netns ip link set lo up +${nsjoin_binary} /tmp/server.netns ip link set server.0 up +${nsjoin_binary} /tmp/server.netns ip link set lo up +ip link set dev client.1 up +ip link set dev server.1 up + +${nsjoin_binary} /tmp/client.netns ${client_args} & +client_pid=\$! +${nsjoin_binary} /tmp/server.netns ${server_args} & +server_pid=\$! + +# Start the iperf server. +${nsjoin_binary} /tmp/server.netns iperf -p ${iperf_port} -s >&2 & +iperf_pid=\$! + +# Show traffic information. +if ! ${client} && ! ${server}; then + ${nsjoin_binary} /tmp/client.netns ping -c 100 -i 0.001 -W 1 ${server_addr} >&2 || true +fi + +results_file=\$(mktemp) +function cleanup { + rm -f \$results_file + kill -TERM \$client_pid + kill -TERM \$server_pid + wait \$client_pid + wait \$server_pid + kill -9 \$iperf_pid 2>/dev/null +} + +# Allow failure from this point. +set +e +trap cleanup EXIT + +# Run the benchmark, recording the results file. +while ${nsjoin_binary} /tmp/client.netns iperf \\ + -p ${proxy_port} -c ${client_addr} -t ${duration} -f m -P ${num_client_threads} 2>&1 \\ + | tee \$results_file \\ + | grep "connect failed" >/dev/null; do + sleep 0.1 # Wait for all services. +done + +# Unlink all relevant devices from the bridge. This is because when the bridge +# is deleted, the kernel may hang. It appears that this problem is fixed in +# upstream commit 1ce5cce895309862d2c35d922816adebe094fe4a. +ip link set client.1 nomaster +ip link set server.1 nomaster +ip link set wan.0 nomaster +ip link set wan.1 nomaster + +# Emit raw results. +cat \$results_file >&2 + +# Emit a useful result (final throughput). +mbits=\$(grep Mbits/sec \$results_file \\ + | sed -n -e 's/^.*[[:space:]]\\([[:digit:]]\\+\\(\\.[[:digit:]]\\+\\)\\?\\)[[:space:]]*Mbits\\/sec.*/\\1/p') +client_cpu_ticks=\$(cat /proc/\$client_pid/stat \\ + | awk '{print (\$14+\$15);}') +server_cpu_ticks=\$(cat /proc/\$server_pid/stat \\ + | awk '{print (\$14+\$15);}') +ticks_per_sec=\$(getconf CLK_TCK) +client_cpu_load=\$(bc -l <<< \$client_cpu_ticks/\$ticks_per_sec/${duration}) +server_cpu_load=\$(bc -l <<< \$server_cpu_ticks/\$ticks_per_sec/${duration}) +echo \$mbits \$client_cpu_load \$server_cpu_load +EOF diff --git a/test/benchmarks/tcp/tcp_proxy.go b/test/benchmarks/tcp/tcp_proxy.go new file mode 100644 index 000000000..4b7ca7a14 --- /dev/null +++ b/test/benchmarks/tcp/tcp_proxy.go @@ -0,0 +1,451 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary tcp_proxy is a simple TCP proxy. +package main + +import ( + "encoding/gob" + "flag" + "fmt" + "io" + "log" + "math/rand" + "net" + "os" + "os/signal" + "regexp" + "runtime" + "runtime/pprof" + "strconv" + "syscall" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" + "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" +) + +var ( + port = flag.Int("port", 0, "bind port (all addresses)") + forward = flag.String("forward", "", "forwarding target") + client = flag.Bool("client", false, "use netstack for listen") + server = flag.Bool("server", false, "use netstack for dial") + + // Netstack-specific options. + mtu = flag.Int("mtu", 1280, "mtu for network stack") + addr = flag.String("addr", "", "address for tap-based netstack") + mask = flag.Int("mask", 8, "mask size for address") + iface = flag.String("iface", "", "network interface name to bind for netstack") + sack = flag.Bool("sack", false, "enable SACK support for netstack") + moderateRecvBuf = flag.Bool("moderate_recv_buf", false, "enable TCP Receive Buffer Auto-tuning") + cubic = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack") + gso = flag.Int("gso", 0, "GSO maximum size") + swgso = flag.Bool("swgso", false, "software-level GSO") + clientTCPProbeFile = flag.String("client_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.") + serverTCPProbeFile = flag.String("server_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.") + cpuprofile = flag.String("cpuprofile", "", "write cpu profile to the specified file.") + memprofile = flag.String("memprofile", "", "write memory profile to the specified file.") +) + +type impl interface { + dial(address string) (net.Conn, error) + listen(port int) (net.Listener, error) + printStats() +} + +type netImpl struct{} + +func (netImpl) dial(address string) (net.Conn, error) { + return net.Dial("tcp", address) +} + +func (netImpl) listen(port int) (net.Listener, error) { + return net.Listen("tcp", fmt.Sprintf(":%d", port)) +} + +func (netImpl) printStats() { +} + +const ( + nicID = 1 // Fixed. + bufSize = 4 << 20 // 4MB. +) + +type netstackImpl struct { + s *stack.Stack + addr tcpip.Address + mode string +} + +func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) { + // Get all interfaces in the namespace. + ifaces, err := net.Interfaces() + if err != nil { + return nil, fmt.Errorf("querying interfaces: %v", err) + } + + for _, iface := range ifaces { + if iface.Name != ifaceName { + continue + } + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fds := make([]int, numChannels) + for i := range fds { + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return nil, fmt.Errorf("unable to create raw socket: %v", err) + } + + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: iface.Index, + Pkttype: syscall.PACKET_HOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + // RAW Sockets by default have a very small SO_RCVBUF of 256KB, + // up it to at least 4MB to reduce packet drops. + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize); err != nil { + return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", bufSize, err) + } + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize); err != nil { + return nil, fmt.Errorf("setsockopt(..., SO_SNDBUF, %v,..) = %v", bufSize, err) + } + + if !*swgso && *gso != 0 { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + } + fds[i] = fd + } + return fds, nil + } + return nil, fmt.Errorf("failed to find interface: %v", ifaceName) +} + +func newNetstackImpl(mode string) (impl, error) { + fds, err := setupNetwork(*iface, runtime.GOMAXPROCS(-1)) + if err != nil { + return nil, err + } + + // Parse details. + parsedAddr := tcpip.Address(net.ParseIP(*addr).To4()) + parsedDest := tcpip.Address("") // Filled in below. + parsedMask := tcpip.AddressMask("") // Filled in below. + switch *mask { + case 8: + parsedDest = tcpip.Address([]byte{parsedAddr[0], 0, 0, 0}) + parsedMask = tcpip.AddressMask([]byte{0xff, 0, 0, 0}) + case 16: + parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], 0, 0}) + parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0, 0}) + case 24: + parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], parsedAddr[2], 0}) + parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0xff, 0}) + default: + // This is just laziness; we don't expect a different mask. + return nil, fmt.Errorf("mask %d not supported", mask) + } + + // Create a new network stack. + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()} + s := stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + }) + + // Generate a new mac for the eth device. + mac := make(net.HardwareAddr, 6) + rand.Read(mac) // Fill with random data. + mac[0] &^= 0x1 // Clear multicast bit. + mac[0] |= 0x2 // Set local assignment bit (IEEE802). + ep, err := fdbased.New(&fdbased.Options{ + FDs: fds, + MTU: uint32(*mtu), + EthernetHeader: true, + Address: tcpip.LinkAddress(mac), + // Enable checksum generation as we need to generate valid + // checksums for the veth device to deliver our packets to the + // peer. But we do want to disable checksum verification as veth + // devices do perform GRO and the linux host kernel may not + // regenerate valid checksums after GRO. + TXChecksumOffload: false, + RXChecksumOffload: true, + PacketDispatchMode: fdbased.RecvMMsg, + GSOMaxSize: uint32(*gso), + SoftwareGSOEnabled: *swgso, + }) + if err != nil { + return nil, fmt.Errorf("failed to create FD endpoint: %v", err) + } + if err := s.CreateNIC(nicID, fifo.New(ep, runtime.GOMAXPROCS(0), 1000)); err != nil { + return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err) + } + if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + return nil, fmt.Errorf("error adding ARP address to %q: %v", *iface, err) + } + if err := s.AddAddress(nicID, ipv4.ProtocolNumber, parsedAddr); err != nil { + return nil, fmt.Errorf("error adding IP address to %q: %v", *iface, err) + } + + subnet, err := tcpip.NewSubnet(parsedDest, parsedMask) + if err != nil { + return nil, fmt.Errorf("tcpip.Subnet(%s, %s): %s", parsedDest, parsedMask, err) + } + // Add default route; we only support + s.SetRouteTable([]tcpip.Route{ + { + Destination: subnet, + NIC: nicID, + }, + }) + + // Set protocol options. + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(*sack)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %s", err) + } + + // Enable Receive Buffer Auto-Tuning. + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(*moderateRecvBuf)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) + } + + // Set Congestion Control to cubic if requested. + if *cubic { + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption for CongestionControlOption(cubic) failed: %s", err) + } + } + + return netstackImpl{ + s: s, + addr: parsedAddr, + mode: mode, + }, nil +} + +func (n netstackImpl) dial(address string) (net.Conn, error) { + host, port, err := net.SplitHostPort(address) + if err != nil { + return nil, err + } + if host == "" { + // A host must be provided for the dial. + return nil, fmt.Errorf("no host provided") + } + portNumber, err := strconv.Atoi(port) + if err != nil { + return nil, err + } + addr := tcpip.FullAddress{ + NIC: nicID, + Addr: tcpip.Address(net.ParseIP(host).To4()), + Port: uint16(portNumber), + } + conn, err := gonet.DialTCP(n.s, addr, ipv4.ProtocolNumber) + if err != nil { + return nil, err + } + return conn, nil +} + +func (n netstackImpl) listen(port int) (net.Listener, error) { + addr := tcpip.FullAddress{ + NIC: nicID, + Port: uint16(port), + } + listener, err := gonet.ListenTCP(n.s, addr, ipv4.ProtocolNumber) + if err != nil { + return nil, err + } + return listener, nil +} + +var zeroFieldsRegexp = regexp.MustCompile(`\s*[a-zA-Z0-9]*:0`) + +func (n netstackImpl) printStats() { + // Don't show zero fields. + stats := zeroFieldsRegexp.ReplaceAllString(fmt.Sprintf("%+v", n.s.Stats()), "") + log.Printf("netstack %s Stats: %+v\n", n.mode, stats) +} + +// installProbe installs a TCP Probe function that will dump endpoint +// state to the specified file. It also returns a close func() that +// can be used to close the probeFile. +func (n netstackImpl) installProbe(probeFileName string) (close func()) { + // Install Probe to dump out end point state. + probeFile, err := os.Create(probeFileName) + if err != nil { + log.Fatalf("failed to create tcp_probe file %s: %v", probeFileName, err) + } + probeEncoder := gob.NewEncoder(probeFile) + // Install a TCP Probe. + n.s.AddTCPProbe(func(state stack.TCPEndpointState) { + probeEncoder.Encode(state) + }) + return func() { probeFile.Close() } +} + +func main() { + flag.Parse() + if *port == 0 { + log.Fatalf("no port provided") + } + if *forward == "" { + log.Fatalf("no forward provided") + } + // Seed the random number generator to ensure that we are given MAC addresses that don't + // for the case of the client and server stack. + rand.Seed(time.Now().UTC().UnixNano()) + + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal("could not create CPU profile: ", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Print("error closing CPU profile: ", err) + } + }() + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) + } + defer pprof.StopCPUProfile() + } + + var ( + in impl + out impl + err error + ) + if *server { + in, err = newNetstackImpl("server") + if *serverTCPProbeFile != "" { + defer in.(netstackImpl).installProbe(*serverTCPProbeFile)() + } + + } else { + in = netImpl{} + } + if err != nil { + log.Fatalf("netstack error: %v", err) + } + if *client { + out, err = newNetstackImpl("client") + if *clientTCPProbeFile != "" { + defer out.(netstackImpl).installProbe(*clientTCPProbeFile)() + } + } else { + out = netImpl{} + } + if err != nil { + log.Fatalf("netstack error: %v", err) + } + + // Dial forward before binding. + var next net.Conn + for { + next, err = out.dial(*forward) + if err == nil { + break + } + time.Sleep(50 * time.Millisecond) + log.Printf("connect failed retrying: %v", err) + } + + // Bind once to the server socket. + listener, err := in.listen(*port) + if err != nil { + // Should not happen, everything must be bound by this time + // this proxy is started. + log.Fatalf("unable to listen: %v", err) + } + log.Printf("client=%v, server=%v, ready.", *client, *server) + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGTERM) + go func() { + <-sigs + if *cpuprofile != "" { + pprof.StopCPUProfile() + } + if *memprofile != "" { + f, err := os.Create(*memprofile) + if err != nil { + log.Fatal("could not create memory profile: ", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Print("error closing memory profile: ", err) + } + }() + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + log.Fatalf("Unable to write heap profile: %v", err) + } + } + os.Exit(0) + }() + + for { + // Forward all connections. + inConn, err := listener.Accept() + if err != nil { + // This should not happen; we are listening + // successfully. Exhausted all available FDs? + log.Fatalf("accept error: %v", err) + } + log.Printf("incoming connection established.") + + // Copy both ways. + go io.Copy(inConn, next) + go io.Copy(next, inConn) + + // Print stats every second. + go func() { + t := time.NewTicker(time.Second) + defer t.Stop() + for { + <-t.C + in.printStats() + out.printStats() + } + }() + + for { + // Dial again. + next, err = out.dial(*forward) + if err == nil { + break + } + } + } +} |