summaryrefslogtreecommitdiffhomepage
path: root/test/benchmarks/tcp
diff options
context:
space:
mode:
Diffstat (limited to 'test/benchmarks/tcp')
-rw-r--r--test/benchmarks/tcp/BUILD41
-rw-r--r--test/benchmarks/tcp/README.md87
-rw-r--r--test/benchmarks/tcp/nsjoin.c47
-rwxr-xr-xtest/benchmarks/tcp/tcp_benchmark.sh396
-rw-r--r--test/benchmarks/tcp/tcp_proxy.go466
5 files changed, 0 insertions, 1037 deletions
diff --git a/test/benchmarks/tcp/BUILD b/test/benchmarks/tcp/BUILD
deleted file mode 100644
index 6dde7d9e6..000000000
--- a/test/benchmarks/tcp/BUILD
+++ /dev/null
@@ -1,41 +0,0 @@
-load("//tools:defs.bzl", "cc_binary", "go_binary")
-
-package(licenses = ["notice"])
-
-go_binary(
- name = "tcp_proxy",
- srcs = ["tcp_proxy.go"],
- visibility = ["//:sandbox"],
- deps = [
- "//pkg/tcpip",
- "//pkg/tcpip/adapters/gonet",
- "//pkg/tcpip/link/fdbased",
- "//pkg/tcpip/link/qdisc/fifo",
- "//pkg/tcpip/network/arp",
- "//pkg/tcpip/network/ipv4",
- "//pkg/tcpip/stack",
- "//pkg/tcpip/transport/tcp",
- "//pkg/tcpip/transport/udp",
- "@org_golang_x_sys//unix:go_default_library",
- ],
-)
-
-# nsjoin is a trivial replacement for nsenter. This is used because nsenter is
-# not available on all systems where this benchmark is run (and we aim to
-# minimize external dependencies.)
-
-cc_binary(
- name = "nsjoin",
- srcs = ["nsjoin.c"],
- visibility = ["//:sandbox"],
-)
-
-sh_binary(
- name = "tcp_benchmark",
- srcs = ["tcp_benchmark.sh"],
- data = [
- ":nsjoin",
- ":tcp_proxy",
- ],
- visibility = ["//:sandbox"],
-)
diff --git a/test/benchmarks/tcp/README.md b/test/benchmarks/tcp/README.md
deleted file mode 100644
index 38e6e69f0..000000000
--- a/test/benchmarks/tcp/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# TCP Benchmarks
-
-This directory contains a standardized TCP benchmark. This helps to evaluate the
-performance of netstack and native networking stacks under various conditions.
-
-## `tcp_benchmark`
-
-This benchmark allows TCP throughput testing under various conditions. The setup
-consists of an iperf client, a client proxy, a server proxy and an iperf server.
-The client proxy and server proxy abstract the network mechanism used to
-communicate between the iperf client and server.
-
-The setup looks like the following:
-
-```
- +--------------+ (native) +--------------+
- | iperf client |[lo @ 10.0.0.1]------>| client proxy |
- +--------------+ +--------------+
- [client.0 @ 10.0.0.2]
- (netstack) | | (native)
- +------+-----+
- |
- [br0]
- |
- Network emulation applied ---> [wan.0:wan.1]
- |
- [br1]
- |
- +------+-----+
- (netstack) | | (native)
- [server.0 @ 10.0.0.3]
- +--------------+ +--------------+
- | iperf server |<------[lo @ 10.0.0.4]| server proxy |
- +--------------+ (native) +--------------+
-```
-
-Different configurations can be run using different arguments. For example:
-
-* Native test under normal internet conditions: `tcp_benchmark`
-* Native test under ideal conditions: `tcp_benchmark --ideal`
-* Netstack client under ideal conditions: `tcp_benchmark --client --ideal`
-* Netstack client with 5% packet loss: `tcp_benchmark --client --ideal --loss
- 5`
-
-Use `tcp_benchmark --help` for full arguments.
-
-This tool may be used to easily generate data for graphing. For example, to
-generate a CSV for various latencies, you might do:
-
-```
-rm -f /tmp/netstack_latency.csv /tmp/native_latency.csv
-latencies=$(seq 0 5 50;
- seq 60 10 100;
- seq 125 25 250;
- seq 300 50 500)
-for latency in $latencies; do
- read throughput client_cpu server_cpu <<< \
- $(./tcp_benchmark --duration 30 --client --ideal --latency $latency)
- echo $latency,$throughput,$client_cpu >> /tmp/netstack_latency.csv
-done
-for latency in $latencies; do
- read throughput client_cpu server_cpu <<< \
- $(./tcp_benchmark --duration 30 --ideal --latency $latency)
- echo $latency,$throughput,$client_cpu >> /tmp/native_latency.csv
-done
-```
-
-Similarly, to generate a CSV for various levels of packet loss, the following
-would be appropriate:
-
-```
-rm -f /tmp/netstack_loss.csv /tmp/native_loss.csv
-losses=$(seq 0 0.1 1.0;
- seq 1.2 0.2 2.0;
- seq 2.5 0.5 5.0;
- seq 6.0 1.0 10.0)
-for loss in $losses; do
- read throughput client_cpu server_cpu <<< \
- $(./tcp_benchmark --duration 30 --client --ideal --latency 10 --loss $loss)
- echo $loss,$throughput,$client_cpu >> /tmp/netstack_loss.csv
-done
-for loss in $losses; do
- read throughput client_cpu server_cpu <<< \
- $(./tcp_benchmark --duration 30 --ideal --latency 10 --loss $loss)
- echo $loss,$throughput,$client_cpu >> /tmp/native_loss.csv
-done
-```
diff --git a/test/benchmarks/tcp/nsjoin.c b/test/benchmarks/tcp/nsjoin.c
deleted file mode 100644
index 524b4d549..000000000
--- a/test/benchmarks/tcp/nsjoin.c
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include <errno.h>
-#include <fcntl.h>
-#include <sched.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-int main(int argc, char** argv) {
- if (argc <= 2) {
- fprintf(stderr, "error: must provide a namespace file.\n");
- fprintf(stderr, "usage: %s <file> [arguments...]\n", argv[0]);
- return 1;
- }
-
- int fd = open(argv[1], O_RDONLY);
- if (fd < 0) {
- fprintf(stderr, "error opening %s: %s\n", argv[1], strerror(errno));
- return 1;
- }
- if (setns(fd, 0) < 0) {
- fprintf(stderr, "error joining %s: %s\n", argv[1], strerror(errno));
- return 1;
- }
-
- execvp(argv[2], &argv[2]);
- return 1;
-}
diff --git a/test/benchmarks/tcp/tcp_benchmark.sh b/test/benchmarks/tcp/tcp_benchmark.sh
deleted file mode 100755
index 6a4f33b96..000000000
--- a/test/benchmarks/tcp/tcp_benchmark.sh
+++ /dev/null
@@ -1,396 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TCP benchmark; see README.md for documentation.
-
-# Fixed parameters.
-iperf_port=45201 # Not likely to be privileged.
-proxy_port=44000 # Ditto.
-client_addr=10.0.0.1
-client_proxy_addr=10.0.0.2
-server_proxy_addr=10.0.0.3
-server_addr=10.0.0.4
-mask=8
-
-# Defaults; this provides a reasonable approximation of a decent internet link.
-# Parameters can be varied independently from this set to see response to
-# various changes in the kind of link available.
-client=false
-server=false
-verbose=false
-gso=0
-swgso=false
-mtu=1280 # 1280 is a reasonable lowest-common-denominator.
-latency=10 # 10ms approximates a fast, dedicated connection.
-latency_variation=1 # +/- 1ms is a relatively low amount of jitter.
-loss=0.1 # 0.1% loss is non-zero, but not extremely high.
-duplicate=0.1 # 0.1% means duplicates are 1/10x as frequent as losses.
-duration=30 # 30s is enough time to consistent results (experimentally).
-helper_dir=$(dirname $0)
-netstack_opts=
-disable_linux_gso=
-num_client_threads=1
-
-# Check for netem support.
-lsmod_output=$(lsmod | grep sch_netem)
-if [ "$?" != "0" ]; then
- echo "warning: sch_netem may not be installed." >&2
-fi
-
-while [ $# -gt 0 ]; do
- case "$1" in
- --client)
- client=true
- ;;
- --client_tcp_probe_file)
- shift
- netstack_opts="${netstack_opts} -client_tcp_probe_file=$1"
- ;;
- --server)
- server=true
- ;;
- --verbose)
- verbose=true
- ;;
- --gso)
- shift
- gso=$1
- ;;
- --swgso)
- swgso=true
- ;;
- --server_tcp_probe_file)
- shift
- netstack_opts="${netstack_opts} -server_tcp_probe_file=$1"
- ;;
- --ideal)
- mtu=1500 # Standard ethernet.
- latency=0 # No latency.
- latency_variation=0 # No jitter.
- loss=0 # No loss.
- duplicate=0 # No duplicates.
- ;;
- --mtu)
- shift
- [ "$#" -le 0 ] && echo "no mtu provided" && exit 1
- mtu=$1
- ;;
- --sack)
- netstack_opts="${netstack_opts} -sack"
- ;;
- --rack)
- netstack_opts="${netstack_opts} -rack"
- ;;
- --cubic)
- netstack_opts="${netstack_opts} -cubic"
- ;;
- --moderate-recv-buf)
- netstack_opts="${netstack_opts} -moderate_recv_buf"
- ;;
- --duration)
- shift
- [ "$#" -le 0 ] && echo "no duration provided" && exit 1
- duration=$1
- ;;
- --latency)
- shift
- [ "$#" -le 0 ] && echo "no latency provided" && exit 1
- latency=$1
- ;;
- --latency-variation)
- shift
- [ "$#" -le 0 ] && echo "no latency variation provided" && exit 1
- latency_variation=$1
- ;;
- --loss)
- shift
- [ "$#" -le 0 ] && echo "no loss probability provided" && exit 1
- loss=$1
- ;;
- --duplicate)
- shift
- [ "$#" -le 0 ] && echo "no duplicate provided" && exit 1
- duplicate=$1
- ;;
- --cpuprofile)
- shift
- netstack_opts="${netstack_opts} -cpuprofile=$1"
- ;;
- --memprofile)
- shift
- netstack_opts="${netstack_opts} -memprofile=$1"
- ;;
- --disable-linux-gso)
- disable_linux_gso=1
- ;;
- --num-client-threads)
- shift
- num_client_threads=$1
- ;;
- --helpers)
- shift
- [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1
- helper_dir=$1
- ;;
- *)
- echo "usage: $0 [options]"
- echo "options:"
- echo " --help show this message"
- echo " --verbose verbose output"
- echo " --client use netstack as the client"
- echo " --ideal reset all network emulation"
- echo " --server use netstack as the server"
- echo " --mtu set the mtu (bytes)"
- echo " --sack enable SACK support"
- echo " --rack enable RACK support"
- echo " --moderate-recv-buf enable TCP receive buffer auto-tuning"
- echo " --cubic enable CUBIC congestion control for Netstack"
- echo " --duration set the test duration (s)"
- echo " --latency set the latency (ms)"
- echo " --latency-variation set the latency variation"
- echo " --loss set the loss probability (%)"
- echo " --duplicate set the duplicate probability (%)"
- echo " --helpers set the helper directory"
- echo " --num-client-threads number of parallel client threads to run"
- echo " --disable-linux-gso disable segmentation offload in the Linux network stack"
- echo ""
- echo "The output will of the script will be:"
- echo " <throughput> <client-cpu-usage> <server-cpu-usage>"
- exit 1
- esac
- shift
-done
-
-if [ ${verbose} == "true" ]; then
- set -x
-fi
-
-# Latency needs to be halved, since it's applied on both ways.
-half_latency=$(echo ${latency}/2 | bc -l | awk '{printf "%1.2f", $0}')
-half_loss=$(echo ${loss}/2 | bc -l | awk '{printf "%1.6f", $0}')
-half_duplicate=$(echo ${duplicate}/2 | bc -l | awk '{printf "%1.6f", $0}')
-helper_dir=${helper_dir#$(pwd)/} # Use relative paths.
-proxy_binary=${helper_dir}/tcp_proxy
-nsjoin_binary=${helper_dir}/nsjoin
-
-if [ ! -e ${proxy_binary} ]; then
- echo "Could not locate ${proxy_binary}, please make sure you've built the binary"
- exit 1
-fi
-
-if [ ! -e ${nsjoin_binary} ]; then
- echo "Could not locate ${nsjoin_binary}, please make sure you've built the binary"
- exit 1
-fi
-
-if [ $(echo ${latency_variation} | awk '{printf "%1.2f", $0}') != "0.00" ]; then
- # As long as there's some jitter, then we use the paretonormal distribution.
- # This will preserve the minimum RTT, but add a realistic amount of jitter to
- # the connection and cause re-ordering, etc. The regular pareto distribution
- # appears to an unreasonable level of delay (we want only small spikes.)
- distribution="distribution paretonormal"
-else
- distribution=""
-fi
-
-# Client proxy that will listen on the client's iperf target forward traffic
-# using the host networking stack.
-client_args="${proxy_binary} -port ${proxy_port} -forward ${server_proxy_addr}:${proxy_port}"
-if ${client}; then
- # Client proxy that will listen on the client's iperf target
- # and forward traffic using netstack.
- client_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -client \\
- -mtu ${mtu} -iface client.0 -addr ${client_proxy_addr} -mask ${mask} \\
- -forward ${server_proxy_addr}:${proxy_port} -gso=${gso} -swgso=${swgso}"
-fi
-
-# Server proxy that will listen on the proxy port and forward to the server's
-# iperf server using the host networking stack.
-server_args="${proxy_binary} -port ${proxy_port} -forward ${server_addr}:${iperf_port}"
-if ${server}; then
- # Server proxy that will listen on the proxy port and forward to the servers'
- # iperf server using netstack.
- server_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -server \\
- -mtu ${mtu} -iface server.0 -addr ${server_proxy_addr} -mask ${mask} \\
- -forward ${server_addr}:${iperf_port} -gso=${gso} -swgso=${swgso}"
-fi
-
-# Specify loss and duplicate parameters only if they are non-zero
-loss_opt=""
-if [ "$(echo $half_loss | bc -q)" != "0" ]; then
- loss_opt="loss random ${half_loss}%"
-fi
-duplicate_opt=""
-if [ "$(echo $half_duplicate | bc -q)" != "0" ]; then
- duplicate_opt="duplicate ${half_duplicate}%"
-fi
-
-exec unshare -U -m -n -r -f -p --mount-proc /bin/bash << EOF
-set -e -m
-
-if [ ${verbose} == "true" ]; then
- set -x
-fi
-
-mount -t tmpfs netstack-bench /tmp
-
-# We may have reset the path in the unshare if the shell loaded some public
-# profiles. Ensure that tools are discoverable via the parent's PATH.
-export PATH=${PATH}
-
-# Add client, server interfaces.
-ip link add client.0 type veth peer name client.1
-ip link add server.0 type veth peer name server.1
-
-# Add network emulation devices.
-ip link add wan.0 type veth peer name wan.1
-ip link set wan.0 up
-ip link set wan.1 up
-
-# Enroll on the bridge.
-ip link add name br0 type bridge
-ip link add name br1 type bridge
-ip link set client.1 master br0
-ip link set server.1 master br1
-ip link set wan.0 master br0
-ip link set wan.1 master br1
-ip link set br0 up
-ip link set br1 up
-
-# Set the MTU appropriately.
-ip link set client.0 mtu ${mtu}
-ip link set server.0 mtu ${mtu}
-ip link set wan.0 mtu ${mtu}
-ip link set wan.1 mtu ${mtu}
-
-# Add appropriate latency, loss and duplication.
-#
-# This is added in at the point of bridge connection.
-for device in wan.0 wan.1; do
- # NOTE: We don't support a loss correlation as testing has shown that it
- # actually doesn't work. The man page actually has a small comment about this
- # "It is also possible to add a correlation, but this option is now deprecated
- # due to the noticed bad behavior." For more information see netem(8).
- tc qdisc add dev \$device root netem \\
- delay ${half_latency}ms ${latency_variation}ms ${distribution} \\
- ${loss_opt} ${duplicate_opt}
-done
-
-# Start a client proxy.
-touch /tmp/client.netns
-unshare -n mount --bind /proc/self/ns/net /tmp/client.netns
-
-# Move the endpoint into the namespace.
-while ip link | grep client.0 > /dev/null; do
- ip link set dev client.0 netns /tmp/client.netns
-done
-
-if ! ${client}; then
- # Only add the address to NIC if netstack is not in use. Otherwise the host
- # will also process the inbound SYN and send a RST back.
- ${nsjoin_binary} /tmp/client.netns ip addr add ${client_proxy_addr}/${mask} dev client.0
-fi
-
-# Start a server proxy.
-touch /tmp/server.netns
-unshare -n mount --bind /proc/self/ns/net /tmp/server.netns
-# Move the endpoint into the namespace.
-while ip link | grep server.0 > /dev/null; do
- ip link set dev server.0 netns /tmp/server.netns
-done
-if ! ${server}; then
- # Only add the address to NIC if netstack is not in use. Otherwise the host
- # will also process the inbound SYN and send a RST back.
- ${nsjoin_binary} /tmp/server.netns ip addr add ${server_proxy_addr}/${mask} dev server.0
-fi
-
-# Add client and server addresses, and bring everything up.
-${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0
-${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0
-if [ "${disable_linux_gso}" == "1" ]; then
- ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 tso off
- ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gro off
- ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gso off
- ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 tso off
- ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gso off
- ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gro off
-fi
-${nsjoin_binary} /tmp/client.netns ip link set client.0 up
-${nsjoin_binary} /tmp/client.netns ip link set lo up
-${nsjoin_binary} /tmp/server.netns ip link set server.0 up
-${nsjoin_binary} /tmp/server.netns ip link set lo up
-ip link set dev client.1 up
-ip link set dev server.1 up
-
-${nsjoin_binary} /tmp/client.netns ${client_args} &
-client_pid=\$!
-${nsjoin_binary} /tmp/server.netns ${server_args} &
-server_pid=\$!
-
-# Start the iperf server.
-${nsjoin_binary} /tmp/server.netns iperf -p ${iperf_port} -s >&2 &
-iperf_pid=\$!
-
-# Show traffic information.
-if ! ${client} && ! ${server}; then
- ${nsjoin_binary} /tmp/client.netns ping -c 100 -i 0.001 -W 1 ${server_addr} >&2 || true
-fi
-
-results_file=\$(mktemp)
-function cleanup {
- rm -f \$results_file
- kill -TERM \$client_pid
- kill -TERM \$server_pid
- wait \$client_pid
- wait \$server_pid
- kill -9 \$iperf_pid 2>/dev/null
-}
-
-# Allow failure from this point.
-set +e
-trap cleanup EXIT
-
-# Run the benchmark, recording the results file.
-while ${nsjoin_binary} /tmp/client.netns iperf \\
- -p ${proxy_port} -c ${client_addr} -t ${duration} -f m -P ${num_client_threads} 2>&1 \\
- | tee \$results_file \\
- | grep "connect failed" >/dev/null; do
- sleep 0.1 # Wait for all services.
-done
-
-# Unlink all relevant devices from the bridge. This is because when the bridge
-# is deleted, the kernel may hang. It appears that this problem is fixed in
-# upstream commit 1ce5cce895309862d2c35d922816adebe094fe4a.
-ip link set client.1 nomaster
-ip link set server.1 nomaster
-ip link set wan.0 nomaster
-ip link set wan.1 nomaster
-
-# Emit raw results.
-cat \$results_file >&2
-
-# Emit a useful result (final throughput).
-mbits=\$(grep Mbits/sec \$results_file \\
- | sed -n -e 's/^.*[[:space:]]\\([[:digit:]]\\+\\(\\.[[:digit:]]\\+\\)\\?\\)[[:space:]]*Mbits\\/sec.*/\\1/p')
-client_cpu_ticks=\$(cat /proc/\$client_pid/stat \\
- | awk '{print (\$14+\$15);}')
-server_cpu_ticks=\$(cat /proc/\$server_pid/stat \\
- | awk '{print (\$14+\$15);}')
-ticks_per_sec=\$(getconf CLK_TCK)
-client_cpu_load=\$(bc -l <<< \$client_cpu_ticks/\$ticks_per_sec/${duration})
-server_cpu_load=\$(bc -l <<< \$server_cpu_ticks/\$ticks_per_sec/${duration})
-echo \$mbits \$client_cpu_load \$server_cpu_load
-EOF
diff --git a/test/benchmarks/tcp/tcp_proxy.go b/test/benchmarks/tcp/tcp_proxy.go
deleted file mode 100644
index 8273b12a7..000000000
--- a/test/benchmarks/tcp/tcp_proxy.go
+++ /dev/null
@@ -1,466 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary tcp_proxy is a simple TCP proxy.
-package main
-
-import (
- "encoding/gob"
- "flag"
- "fmt"
- "io"
- "log"
- "math/rand"
- "net"
- "os"
- "os/signal"
- "regexp"
- "runtime"
- "runtime/pprof"
- "strconv"
- "time"
-
- "golang.org/x/sys/unix"
- "gvisor.dev/gvisor/pkg/tcpip"
- "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
- "gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
- "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
- "gvisor.dev/gvisor/pkg/tcpip/network/arp"
- "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
- "gvisor.dev/gvisor/pkg/tcpip/stack"
- "gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
- "gvisor.dev/gvisor/pkg/tcpip/transport/udp"
-)
-
-var (
- port = flag.Int("port", 0, "bind port (all addresses)")
- forward = flag.String("forward", "", "forwarding target")
- client = flag.Bool("client", false, "use netstack for listen")
- server = flag.Bool("server", false, "use netstack for dial")
-
- // Netstack-specific options.
- mtu = flag.Int("mtu", 1280, "mtu for network stack")
- addr = flag.String("addr", "", "address for tap-based netstack")
- mask = flag.Int("mask", 8, "mask size for address")
- iface = flag.String("iface", "", "network interface name to bind for netstack")
- sack = flag.Bool("sack", false, "enable SACK support for netstack")
- rack = flag.Bool("rack", false, "enable RACK in TCP")
- moderateRecvBuf = flag.Bool("moderate_recv_buf", false, "enable TCP Receive Buffer Auto-tuning")
- cubic = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack")
- gso = flag.Int("gso", 0, "GSO maximum size")
- swgso = flag.Bool("swgso", false, "software-level GSO")
- clientTCPProbeFile = flag.String("client_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
- serverTCPProbeFile = flag.String("server_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
- cpuprofile = flag.String("cpuprofile", "", "write cpu profile to the specified file.")
- memprofile = flag.String("memprofile", "", "write memory profile to the specified file.")
-)
-
-type impl interface {
- dial(address string) (net.Conn, error)
- listen(port int) (net.Listener, error)
- printStats()
-}
-
-type netImpl struct{}
-
-func (netImpl) dial(address string) (net.Conn, error) {
- return net.Dial("tcp", address)
-}
-
-func (netImpl) listen(port int) (net.Listener, error) {
- return net.Listen("tcp", fmt.Sprintf(":%d", port))
-}
-
-func (netImpl) printStats() {
-}
-
-const (
- nicID = 1 // Fixed.
- bufSize = 4 << 20 // 4MB.
-)
-
-type netstackImpl struct {
- s *stack.Stack
- addr tcpip.Address
- mode string
-}
-
-func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
- // Get all interfaces in the namespace.
- ifaces, err := net.Interfaces()
- if err != nil {
- return nil, fmt.Errorf("querying interfaces: %v", err)
- }
-
- for _, iface := range ifaces {
- if iface.Name != ifaceName {
- continue
- }
- // Create the socket.
- const protocol = 0x0300 // htons(ETH_P_ALL)
- fds := make([]int, numChannels)
- for i := range fds {
- fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, protocol)
- if err != nil {
- return nil, fmt.Errorf("unable to create raw socket: %v", err)
- }
-
- // Bind to the appropriate device.
- ll := unix.SockaddrLinklayer{
- Protocol: protocol,
- Ifindex: iface.Index,
- Pkttype: unix.PACKET_HOST,
- }
- if err := unix.Bind(fd, &ll); err != nil {
- return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
- }
-
- // RAW Sockets by default have a very small SO_RCVBUF of 256KB,
- // up it to at least 4MB to reduce packet drops.
- if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize); err != nil {
- return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", bufSize, err)
- }
-
- if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize); err != nil {
- return nil, fmt.Errorf("setsockopt(..., SO_SNDBUF, %v,..) = %v", bufSize, err)
- }
-
- if !*swgso && *gso != 0 {
- if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
- return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
- }
- }
- fds[i] = fd
- }
- return fds, nil
- }
- return nil, fmt.Errorf("failed to find interface: %v", ifaceName)
-}
-
-func newNetstackImpl(mode string) (impl, error) {
- fds, err := setupNetwork(*iface, runtime.GOMAXPROCS(-1))
- if err != nil {
- return nil, err
- }
-
- // Parse details.
- parsedAddr := tcpip.Address(net.ParseIP(*addr).To4())
- parsedDest := tcpip.Address("") // Filled in below.
- parsedMask := tcpip.AddressMask("") // Filled in below.
- switch *mask {
- case 8:
- parsedDest = tcpip.Address([]byte{parsedAddr[0], 0, 0, 0})
- parsedMask = tcpip.AddressMask([]byte{0xff, 0, 0, 0})
- case 16:
- parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], 0, 0})
- parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0, 0})
- case 24:
- parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], parsedAddr[2], 0})
- parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0xff, 0})
- default:
- // This is just laziness; we don't expect a different mask.
- return nil, fmt.Errorf("mask %d not supported", mask)
- }
-
- // Create a new network stack.
- netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol}
- transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol}
- s := stack.New(stack.Options{
- NetworkProtocols: netProtos,
- TransportProtocols: transProtos,
- })
-
- // Generate a new mac for the eth device.
- mac := make(net.HardwareAddr, 6)
- rand.Read(mac) // Fill with random data.
- mac[0] &^= 0x1 // Clear multicast bit.
- mac[0] |= 0x2 // Set local assignment bit (IEEE802).
- ep, err := fdbased.New(&fdbased.Options{
- FDs: fds,
- MTU: uint32(*mtu),
- EthernetHeader: true,
- Address: tcpip.LinkAddress(mac),
- // Enable checksum generation as we need to generate valid
- // checksums for the veth device to deliver our packets to the
- // peer. But we do want to disable checksum verification as veth
- // devices do perform GRO and the linux host kernel may not
- // regenerate valid checksums after GRO.
- TXChecksumOffload: false,
- RXChecksumOffload: true,
- PacketDispatchMode: fdbased.RecvMMsg,
- GSOMaxSize: uint32(*gso),
- SoftwareGSOEnabled: *swgso,
- })
- if err != nil {
- return nil, fmt.Errorf("failed to create FD endpoint: %v", err)
- }
- if err := s.CreateNIC(nicID, fifo.New(ep, runtime.GOMAXPROCS(0), 1000)); err != nil {
- return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err)
- }
- protocolAddr := tcpip.ProtocolAddress{
- Protocol: ipv4.ProtocolNumber,
- AddressWithPrefix: parsedAddr.WithPrefix(),
- }
- if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil {
- return nil, fmt.Errorf("error adding IP address %+v to %q: %s", protocolAddr, *iface, err)
- }
-
- subnet, err := tcpip.NewSubnet(parsedDest, parsedMask)
- if err != nil {
- return nil, fmt.Errorf("tcpip.Subnet(%s, %s): %s", parsedDest, parsedMask, err)
- }
- // Add default route; we only support
- s.SetRouteTable([]tcpip.Route{
- {
- Destination: subnet,
- NIC: nicID,
- },
- })
-
- // Set protocol options.
- {
- opt := tcpip.TCPSACKEnabled(*sack)
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
- }
- }
-
- if *rack {
- opt := tcpip.TCPRecovery(tcpip.TCPRACKLossDetection)
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- return nil, fmt.Errorf("enabling RACK failed: %v", err)
- }
- }
-
- // Enable Receive Buffer Auto-Tuning.
- {
- opt := tcpip.TCPModerateReceiveBufferOption(*moderateRecvBuf)
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
- }
- }
-
- // Set Congestion Control to cubic if requested.
- if *cubic {
- opt := tcpip.CongestionControlOption("cubic")
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, opt, err)
- }
- }
-
- return netstackImpl{
- s: s,
- addr: parsedAddr,
- mode: mode,
- }, nil
-}
-
-func (n netstackImpl) dial(address string) (net.Conn, error) {
- host, port, err := net.SplitHostPort(address)
- if err != nil {
- return nil, err
- }
- if host == "" {
- // A host must be provided for the dial.
- return nil, fmt.Errorf("no host provided")
- }
- portNumber, err := strconv.Atoi(port)
- if err != nil {
- return nil, err
- }
- addr := tcpip.FullAddress{
- NIC: nicID,
- Addr: tcpip.Address(net.ParseIP(host).To4()),
- Port: uint16(portNumber),
- }
- conn, err := gonet.DialTCP(n.s, addr, ipv4.ProtocolNumber)
- if err != nil {
- return nil, err
- }
- return conn, nil
-}
-
-func (n netstackImpl) listen(port int) (net.Listener, error) {
- addr := tcpip.FullAddress{
- NIC: nicID,
- Port: uint16(port),
- }
- listener, err := gonet.ListenTCP(n.s, addr, ipv4.ProtocolNumber)
- if err != nil {
- return nil, err
- }
- return listener, nil
-}
-
-var zeroFieldsRegexp = regexp.MustCompile(`\s*[a-zA-Z0-9]*:0`)
-
-func (n netstackImpl) printStats() {
- // Don't show zero fields.
- stats := zeroFieldsRegexp.ReplaceAllString(fmt.Sprintf("%+v", n.s.Stats()), "")
- log.Printf("netstack %s Stats: %+v\n", n.mode, stats)
-}
-
-// installProbe installs a TCP Probe function that will dump endpoint
-// state to the specified file. It also returns a close func() that
-// can be used to close the probeFile.
-func (n netstackImpl) installProbe(probeFileName string) (close func()) {
- // Install Probe to dump out end point state.
- probeFile, err := os.Create(probeFileName)
- if err != nil {
- log.Fatalf("failed to create tcp_probe file %s: %v", probeFileName, err)
- }
- probeEncoder := gob.NewEncoder(probeFile)
- // Install a TCP Probe.
- n.s.AddTCPProbe(func(state stack.TCPEndpointState) {
- probeEncoder.Encode(state)
- })
- return func() { probeFile.Close() }
-}
-
-func main() {
- flag.Parse()
- if *port == 0 {
- log.Fatalf("no port provided")
- }
- if *forward == "" {
- log.Fatalf("no forward provided")
- }
- // Seed the random number generator to ensure that we are given MAC addresses that don't
- // for the case of the client and server stack.
- rand.Seed(time.Now().UTC().UnixNano())
-
- if *cpuprofile != "" {
- f, err := os.Create(*cpuprofile)
- if err != nil {
- log.Fatal("could not create CPU profile: ", err)
- }
- defer func() {
- if err := f.Close(); err != nil {
- log.Print("error closing CPU profile: ", err)
- }
- }()
- if err := pprof.StartCPUProfile(f); err != nil {
- log.Fatal("could not start CPU profile: ", err)
- }
- defer pprof.StopCPUProfile()
- }
-
- var (
- in impl
- out impl
- err error
- )
- if *server {
- in, err = newNetstackImpl("server")
- if *serverTCPProbeFile != "" {
- defer in.(netstackImpl).installProbe(*serverTCPProbeFile)()
- }
-
- } else {
- in = netImpl{}
- }
- if err != nil {
- log.Fatalf("netstack error: %v", err)
- }
- if *client {
- out, err = newNetstackImpl("client")
- if *clientTCPProbeFile != "" {
- defer out.(netstackImpl).installProbe(*clientTCPProbeFile)()
- }
- } else {
- out = netImpl{}
- }
- if err != nil {
- log.Fatalf("netstack error: %v", err)
- }
-
- // Dial forward before binding.
- var next net.Conn
- for {
- next, err = out.dial(*forward)
- if err == nil {
- break
- }
- time.Sleep(50 * time.Millisecond)
- log.Printf("connect failed retrying: %v", err)
- }
-
- // Bind once to the server socket.
- listener, err := in.listen(*port)
- if err != nil {
- // Should not happen, everything must be bound by this time
- // this proxy is started.
- log.Fatalf("unable to listen: %v", err)
- }
- log.Printf("client=%v, server=%v, ready.", *client, *server)
-
- sigs := make(chan os.Signal, 1)
- signal.Notify(sigs, unix.SIGTERM)
- go func() {
- <-sigs
- if *cpuprofile != "" {
- pprof.StopCPUProfile()
- }
- if *memprofile != "" {
- f, err := os.Create(*memprofile)
- if err != nil {
- log.Fatal("could not create memory profile: ", err)
- }
- defer func() {
- if err := f.Close(); err != nil {
- log.Print("error closing memory profile: ", err)
- }
- }()
- runtime.GC() // get up-to-date statistics
- if err := pprof.WriteHeapProfile(f); err != nil {
- log.Fatalf("Unable to write heap profile: %v", err)
- }
- }
- os.Exit(0)
- }()
-
- for {
- // Forward all connections.
- inConn, err := listener.Accept()
- if err != nil {
- // This should not happen; we are listening
- // successfully. Exhausted all available FDs?
- log.Fatalf("accept error: %v", err)
- }
- log.Printf("incoming connection established.")
-
- // Copy both ways.
- go io.Copy(inConn, next)
- go io.Copy(next, inConn)
-
- // Print stats every second.
- go func() {
- t := time.NewTicker(time.Second)
- defer t.Stop()
- for {
- <-t.C
- in.printStats()
- out.printStats()
- }
- }()
-
- for {
- // Dial again.
- next, err = out.dial(*forward)
- if err == nil {
- break
- }
- }
- }
-}