diff options
Diffstat (limited to 'test/benchmarks/tcp')
-rw-r--r-- | test/benchmarks/tcp/BUILD | 41 | ||||
-rw-r--r-- | test/benchmarks/tcp/README.md | 87 | ||||
-rw-r--r-- | test/benchmarks/tcp/nsjoin.c | 47 | ||||
-rwxr-xr-x | test/benchmarks/tcp/tcp_benchmark.sh | 396 | ||||
-rw-r--r-- | test/benchmarks/tcp/tcp_proxy.go | 463 |
5 files changed, 0 insertions, 1034 deletions
diff --git a/test/benchmarks/tcp/BUILD b/test/benchmarks/tcp/BUILD deleted file mode 100644 index 6dde7d9e6..000000000 --- a/test/benchmarks/tcp/BUILD +++ /dev/null @@ -1,41 +0,0 @@ -load("//tools:defs.bzl", "cc_binary", "go_binary") - -package(licenses = ["notice"]) - -go_binary( - name = "tcp_proxy", - srcs = ["tcp_proxy.go"], - visibility = ["//:sandbox"], - deps = [ - "//pkg/tcpip", - "//pkg/tcpip/adapters/gonet", - "//pkg/tcpip/link/fdbased", - "//pkg/tcpip/link/qdisc/fifo", - "//pkg/tcpip/network/arp", - "//pkg/tcpip/network/ipv4", - "//pkg/tcpip/stack", - "//pkg/tcpip/transport/tcp", - "//pkg/tcpip/transport/udp", - "@org_golang_x_sys//unix:go_default_library", - ], -) - -# nsjoin is a trivial replacement for nsenter. This is used because nsenter is -# not available on all systems where this benchmark is run (and we aim to -# minimize external dependencies.) - -cc_binary( - name = "nsjoin", - srcs = ["nsjoin.c"], - visibility = ["//:sandbox"], -) - -sh_binary( - name = "tcp_benchmark", - srcs = ["tcp_benchmark.sh"], - data = [ - ":nsjoin", - ":tcp_proxy", - ], - visibility = ["//:sandbox"], -) diff --git a/test/benchmarks/tcp/README.md b/test/benchmarks/tcp/README.md deleted file mode 100644 index 38e6e69f0..000000000 --- a/test/benchmarks/tcp/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# TCP Benchmarks - -This directory contains a standardized TCP benchmark. This helps to evaluate the -performance of netstack and native networking stacks under various conditions. - -## `tcp_benchmark` - -This benchmark allows TCP throughput testing under various conditions. The setup -consists of an iperf client, a client proxy, a server proxy and an iperf server. -The client proxy and server proxy abstract the network mechanism used to -communicate between the iperf client and server. - -The setup looks like the following: - -``` - +--------------+ (native) +--------------+ - | iperf client |[lo @ 10.0.0.1]------>| client proxy | - +--------------+ +--------------+ - [client.0 @ 10.0.0.2] - (netstack) | | (native) - +------+-----+ - | - [br0] - | - Network emulation applied ---> [wan.0:wan.1] - | - [br1] - | - +------+-----+ - (netstack) | | (native) - [server.0 @ 10.0.0.3] - +--------------+ +--------------+ - | iperf server |<------[lo @ 10.0.0.4]| server proxy | - +--------------+ (native) +--------------+ -``` - -Different configurations can be run using different arguments. For example: - -* Native test under normal internet conditions: `tcp_benchmark` -* Native test under ideal conditions: `tcp_benchmark --ideal` -* Netstack client under ideal conditions: `tcp_benchmark --client --ideal` -* Netstack client with 5% packet loss: `tcp_benchmark --client --ideal --loss - 5` - -Use `tcp_benchmark --help` for full arguments. - -This tool may be used to easily generate data for graphing. For example, to -generate a CSV for various latencies, you might do: - -``` -rm -f /tmp/netstack_latency.csv /tmp/native_latency.csv -latencies=$(seq 0 5 50; - seq 60 10 100; - seq 125 25 250; - seq 300 50 500) -for latency in $latencies; do - read throughput client_cpu server_cpu <<< \ - $(./tcp_benchmark --duration 30 --client --ideal --latency $latency) - echo $latency,$throughput,$client_cpu >> /tmp/netstack_latency.csv -done -for latency in $latencies; do - read throughput client_cpu server_cpu <<< \ - $(./tcp_benchmark --duration 30 --ideal --latency $latency) - echo $latency,$throughput,$client_cpu >> /tmp/native_latency.csv -done -``` - -Similarly, to generate a CSV for various levels of packet loss, the following -would be appropriate: - -``` -rm -f /tmp/netstack_loss.csv /tmp/native_loss.csv -losses=$(seq 0 0.1 1.0; - seq 1.2 0.2 2.0; - seq 2.5 0.5 5.0; - seq 6.0 1.0 10.0) -for loss in $losses; do - read throughput client_cpu server_cpu <<< \ - $(./tcp_benchmark --duration 30 --client --ideal --latency 10 --loss $loss) - echo $loss,$throughput,$client_cpu >> /tmp/netstack_loss.csv -done -for loss in $losses; do - read throughput client_cpu server_cpu <<< \ - $(./tcp_benchmark --duration 30 --ideal --latency 10 --loss $loss) - echo $loss,$throughput,$client_cpu >> /tmp/native_loss.csv -done -``` diff --git a/test/benchmarks/tcp/nsjoin.c b/test/benchmarks/tcp/nsjoin.c deleted file mode 100644 index 524b4d549..000000000 --- a/test/benchmarks/tcp/nsjoin.c +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include <errno.h> -#include <fcntl.h> -#include <sched.h> -#include <stdio.h> -#include <string.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <unistd.h> - -int main(int argc, char** argv) { - if (argc <= 2) { - fprintf(stderr, "error: must provide a namespace file.\n"); - fprintf(stderr, "usage: %s <file> [arguments...]\n", argv[0]); - return 1; - } - - int fd = open(argv[1], O_RDONLY); - if (fd < 0) { - fprintf(stderr, "error opening %s: %s\n", argv[1], strerror(errno)); - return 1; - } - if (setns(fd, 0) < 0) { - fprintf(stderr, "error joining %s: %s\n", argv[1], strerror(errno)); - return 1; - } - - execvp(argv[2], &argv[2]); - return 1; -} diff --git a/test/benchmarks/tcp/tcp_benchmark.sh b/test/benchmarks/tcp/tcp_benchmark.sh deleted file mode 100755 index 6a4f33b96..000000000 --- a/test/benchmarks/tcp/tcp_benchmark.sh +++ /dev/null @@ -1,396 +0,0 @@ -#!/bin/bash - -# Copyright 2018 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# TCP benchmark; see README.md for documentation. - -# Fixed parameters. -iperf_port=45201 # Not likely to be privileged. -proxy_port=44000 # Ditto. -client_addr=10.0.0.1 -client_proxy_addr=10.0.0.2 -server_proxy_addr=10.0.0.3 -server_addr=10.0.0.4 -mask=8 - -# Defaults; this provides a reasonable approximation of a decent internet link. -# Parameters can be varied independently from this set to see response to -# various changes in the kind of link available. -client=false -server=false -verbose=false -gso=0 -swgso=false -mtu=1280 # 1280 is a reasonable lowest-common-denominator. -latency=10 # 10ms approximates a fast, dedicated connection. -latency_variation=1 # +/- 1ms is a relatively low amount of jitter. -loss=0.1 # 0.1% loss is non-zero, but not extremely high. -duplicate=0.1 # 0.1% means duplicates are 1/10x as frequent as losses. -duration=30 # 30s is enough time to consistent results (experimentally). -helper_dir=$(dirname $0) -netstack_opts= -disable_linux_gso= -num_client_threads=1 - -# Check for netem support. -lsmod_output=$(lsmod | grep sch_netem) -if [ "$?" != "0" ]; then - echo "warning: sch_netem may not be installed." >&2 -fi - -while [ $# -gt 0 ]; do - case "$1" in - --client) - client=true - ;; - --client_tcp_probe_file) - shift - netstack_opts="${netstack_opts} -client_tcp_probe_file=$1" - ;; - --server) - server=true - ;; - --verbose) - verbose=true - ;; - --gso) - shift - gso=$1 - ;; - --swgso) - swgso=true - ;; - --server_tcp_probe_file) - shift - netstack_opts="${netstack_opts} -server_tcp_probe_file=$1" - ;; - --ideal) - mtu=1500 # Standard ethernet. - latency=0 # No latency. - latency_variation=0 # No jitter. - loss=0 # No loss. - duplicate=0 # No duplicates. - ;; - --mtu) - shift - [ "$#" -le 0 ] && echo "no mtu provided" && exit 1 - mtu=$1 - ;; - --sack) - netstack_opts="${netstack_opts} -sack" - ;; - --rack) - netstack_opts="${netstack_opts} -rack" - ;; - --cubic) - netstack_opts="${netstack_opts} -cubic" - ;; - --moderate-recv-buf) - netstack_opts="${netstack_opts} -moderate_recv_buf" - ;; - --duration) - shift - [ "$#" -le 0 ] && echo "no duration provided" && exit 1 - duration=$1 - ;; - --latency) - shift - [ "$#" -le 0 ] && echo "no latency provided" && exit 1 - latency=$1 - ;; - --latency-variation) - shift - [ "$#" -le 0 ] && echo "no latency variation provided" && exit 1 - latency_variation=$1 - ;; - --loss) - shift - [ "$#" -le 0 ] && echo "no loss probability provided" && exit 1 - loss=$1 - ;; - --duplicate) - shift - [ "$#" -le 0 ] && echo "no duplicate provided" && exit 1 - duplicate=$1 - ;; - --cpuprofile) - shift - netstack_opts="${netstack_opts} -cpuprofile=$1" - ;; - --memprofile) - shift - netstack_opts="${netstack_opts} -memprofile=$1" - ;; - --disable-linux-gso) - disable_linux_gso=1 - ;; - --num-client-threads) - shift - num_client_threads=$1 - ;; - --helpers) - shift - [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1 - helper_dir=$1 - ;; - *) - echo "usage: $0 [options]" - echo "options:" - echo " --help show this message" - echo " --verbose verbose output" - echo " --client use netstack as the client" - echo " --ideal reset all network emulation" - echo " --server use netstack as the server" - echo " --mtu set the mtu (bytes)" - echo " --sack enable SACK support" - echo " --rack enable RACK support" - echo " --moderate-recv-buf enable TCP receive buffer auto-tuning" - echo " --cubic enable CUBIC congestion control for Netstack" - echo " --duration set the test duration (s)" - echo " --latency set the latency (ms)" - echo " --latency-variation set the latency variation" - echo " --loss set the loss probability (%)" - echo " --duplicate set the duplicate probability (%)" - echo " --helpers set the helper directory" - echo " --num-client-threads number of parallel client threads to run" - echo " --disable-linux-gso disable segmentation offload in the Linux network stack" - echo "" - echo "The output will of the script will be:" - echo " <throughput> <client-cpu-usage> <server-cpu-usage>" - exit 1 - esac - shift -done - -if [ ${verbose} == "true" ]; then - set -x -fi - -# Latency needs to be halved, since it's applied on both ways. -half_latency=$(echo ${latency}/2 | bc -l | awk '{printf "%1.2f", $0}') -half_loss=$(echo ${loss}/2 | bc -l | awk '{printf "%1.6f", $0}') -half_duplicate=$(echo ${duplicate}/2 | bc -l | awk '{printf "%1.6f", $0}') -helper_dir=${helper_dir#$(pwd)/} # Use relative paths. -proxy_binary=${helper_dir}/tcp_proxy -nsjoin_binary=${helper_dir}/nsjoin - -if [ ! -e ${proxy_binary} ]; then - echo "Could not locate ${proxy_binary}, please make sure you've built the binary" - exit 1 -fi - -if [ ! -e ${nsjoin_binary} ]; then - echo "Could not locate ${nsjoin_binary}, please make sure you've built the binary" - exit 1 -fi - -if [ $(echo ${latency_variation} | awk '{printf "%1.2f", $0}') != "0.00" ]; then - # As long as there's some jitter, then we use the paretonormal distribution. - # This will preserve the minimum RTT, but add a realistic amount of jitter to - # the connection and cause re-ordering, etc. The regular pareto distribution - # appears to an unreasonable level of delay (we want only small spikes.) - distribution="distribution paretonormal" -else - distribution="" -fi - -# Client proxy that will listen on the client's iperf target forward traffic -# using the host networking stack. -client_args="${proxy_binary} -port ${proxy_port} -forward ${server_proxy_addr}:${proxy_port}" -if ${client}; then - # Client proxy that will listen on the client's iperf target - # and forward traffic using netstack. - client_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -client \\ - -mtu ${mtu} -iface client.0 -addr ${client_proxy_addr} -mask ${mask} \\ - -forward ${server_proxy_addr}:${proxy_port} -gso=${gso} -swgso=${swgso}" -fi - -# Server proxy that will listen on the proxy port and forward to the server's -# iperf server using the host networking stack. -server_args="${proxy_binary} -port ${proxy_port} -forward ${server_addr}:${iperf_port}" -if ${server}; then - # Server proxy that will listen on the proxy port and forward to the servers' - # iperf server using netstack. - server_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -server \\ - -mtu ${mtu} -iface server.0 -addr ${server_proxy_addr} -mask ${mask} \\ - -forward ${server_addr}:${iperf_port} -gso=${gso} -swgso=${swgso}" -fi - -# Specify loss and duplicate parameters only if they are non-zero -loss_opt="" -if [ "$(echo $half_loss | bc -q)" != "0" ]; then - loss_opt="loss random ${half_loss}%" -fi -duplicate_opt="" -if [ "$(echo $half_duplicate | bc -q)" != "0" ]; then - duplicate_opt="duplicate ${half_duplicate}%" -fi - -exec unshare -U -m -n -r -f -p --mount-proc /bin/bash << EOF -set -e -m - -if [ ${verbose} == "true" ]; then - set -x -fi - -mount -t tmpfs netstack-bench /tmp - -# We may have reset the path in the unshare if the shell loaded some public -# profiles. Ensure that tools are discoverable via the parent's PATH. -export PATH=${PATH} - -# Add client, server interfaces. -ip link add client.0 type veth peer name client.1 -ip link add server.0 type veth peer name server.1 - -# Add network emulation devices. -ip link add wan.0 type veth peer name wan.1 -ip link set wan.0 up -ip link set wan.1 up - -# Enroll on the bridge. -ip link add name br0 type bridge -ip link add name br1 type bridge -ip link set client.1 master br0 -ip link set server.1 master br1 -ip link set wan.0 master br0 -ip link set wan.1 master br1 -ip link set br0 up -ip link set br1 up - -# Set the MTU appropriately. -ip link set client.0 mtu ${mtu} -ip link set server.0 mtu ${mtu} -ip link set wan.0 mtu ${mtu} -ip link set wan.1 mtu ${mtu} - -# Add appropriate latency, loss and duplication. -# -# This is added in at the point of bridge connection. -for device in wan.0 wan.1; do - # NOTE: We don't support a loss correlation as testing has shown that it - # actually doesn't work. The man page actually has a small comment about this - # "It is also possible to add a correlation, but this option is now deprecated - # due to the noticed bad behavior." For more information see netem(8). - tc qdisc add dev \$device root netem \\ - delay ${half_latency}ms ${latency_variation}ms ${distribution} \\ - ${loss_opt} ${duplicate_opt} -done - -# Start a client proxy. -touch /tmp/client.netns -unshare -n mount --bind /proc/self/ns/net /tmp/client.netns - -# Move the endpoint into the namespace. -while ip link | grep client.0 > /dev/null; do - ip link set dev client.0 netns /tmp/client.netns -done - -if ! ${client}; then - # Only add the address to NIC if netstack is not in use. Otherwise the host - # will also process the inbound SYN and send a RST back. - ${nsjoin_binary} /tmp/client.netns ip addr add ${client_proxy_addr}/${mask} dev client.0 -fi - -# Start a server proxy. -touch /tmp/server.netns -unshare -n mount --bind /proc/self/ns/net /tmp/server.netns -# Move the endpoint into the namespace. -while ip link | grep server.0 > /dev/null; do - ip link set dev server.0 netns /tmp/server.netns -done -if ! ${server}; then - # Only add the address to NIC if netstack is not in use. Otherwise the host - # will also process the inbound SYN and send a RST back. - ${nsjoin_binary} /tmp/server.netns ip addr add ${server_proxy_addr}/${mask} dev server.0 -fi - -# Add client and server addresses, and bring everything up. -${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0 -${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0 -if [ "${disable_linux_gso}" == "1" ]; then - ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 tso off - ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gro off - ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gso off - ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 tso off - ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gso off - ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gro off -fi -${nsjoin_binary} /tmp/client.netns ip link set client.0 up -${nsjoin_binary} /tmp/client.netns ip link set lo up -${nsjoin_binary} /tmp/server.netns ip link set server.0 up -${nsjoin_binary} /tmp/server.netns ip link set lo up -ip link set dev client.1 up -ip link set dev server.1 up - -${nsjoin_binary} /tmp/client.netns ${client_args} & -client_pid=\$! -${nsjoin_binary} /tmp/server.netns ${server_args} & -server_pid=\$! - -# Start the iperf server. -${nsjoin_binary} /tmp/server.netns iperf -p ${iperf_port} -s >&2 & -iperf_pid=\$! - -# Show traffic information. -if ! ${client} && ! ${server}; then - ${nsjoin_binary} /tmp/client.netns ping -c 100 -i 0.001 -W 1 ${server_addr} >&2 || true -fi - -results_file=\$(mktemp) -function cleanup { - rm -f \$results_file - kill -TERM \$client_pid - kill -TERM \$server_pid - wait \$client_pid - wait \$server_pid - kill -9 \$iperf_pid 2>/dev/null -} - -# Allow failure from this point. -set +e -trap cleanup EXIT - -# Run the benchmark, recording the results file. -while ${nsjoin_binary} /tmp/client.netns iperf \\ - -p ${proxy_port} -c ${client_addr} -t ${duration} -f m -P ${num_client_threads} 2>&1 \\ - | tee \$results_file \\ - | grep "connect failed" >/dev/null; do - sleep 0.1 # Wait for all services. -done - -# Unlink all relevant devices from the bridge. This is because when the bridge -# is deleted, the kernel may hang. It appears that this problem is fixed in -# upstream commit 1ce5cce895309862d2c35d922816adebe094fe4a. -ip link set client.1 nomaster -ip link set server.1 nomaster -ip link set wan.0 nomaster -ip link set wan.1 nomaster - -# Emit raw results. -cat \$results_file >&2 - -# Emit a useful result (final throughput). -mbits=\$(grep Mbits/sec \$results_file \\ - | sed -n -e 's/^.*[[:space:]]\\([[:digit:]]\\+\\(\\.[[:digit:]]\\+\\)\\?\\)[[:space:]]*Mbits\\/sec.*/\\1/p') -client_cpu_ticks=\$(cat /proc/\$client_pid/stat \\ - | awk '{print (\$14+\$15);}') -server_cpu_ticks=\$(cat /proc/\$server_pid/stat \\ - | awk '{print (\$14+\$15);}') -ticks_per_sec=\$(getconf CLK_TCK) -client_cpu_load=\$(bc -l <<< \$client_cpu_ticks/\$ticks_per_sec/${duration}) -server_cpu_load=\$(bc -l <<< \$server_cpu_ticks/\$ticks_per_sec/${duration}) -echo \$mbits \$client_cpu_load \$server_cpu_load -EOF diff --git a/test/benchmarks/tcp/tcp_proxy.go b/test/benchmarks/tcp/tcp_proxy.go deleted file mode 100644 index 780e4f7ae..000000000 --- a/test/benchmarks/tcp/tcp_proxy.go +++ /dev/null @@ -1,463 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary tcp_proxy is a simple TCP proxy. -package main - -import ( - "encoding/gob" - "flag" - "fmt" - "io" - "log" - "math/rand" - "net" - "os" - "os/signal" - "regexp" - "runtime" - "runtime/pprof" - "strconv" - "syscall" - "time" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" - "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" - "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" - "gvisor.dev/gvisor/pkg/tcpip/network/arp" - "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" - "gvisor.dev/gvisor/pkg/tcpip/stack" - "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" - "gvisor.dev/gvisor/pkg/tcpip/transport/udp" -) - -var ( - port = flag.Int("port", 0, "bind port (all addresses)") - forward = flag.String("forward", "", "forwarding target") - client = flag.Bool("client", false, "use netstack for listen") - server = flag.Bool("server", false, "use netstack for dial") - - // Netstack-specific options. - mtu = flag.Int("mtu", 1280, "mtu for network stack") - addr = flag.String("addr", "", "address for tap-based netstack") - mask = flag.Int("mask", 8, "mask size for address") - iface = flag.String("iface", "", "network interface name to bind for netstack") - sack = flag.Bool("sack", false, "enable SACK support for netstack") - rack = flag.Bool("rack", false, "enable RACK in TCP") - moderateRecvBuf = flag.Bool("moderate_recv_buf", false, "enable TCP Receive Buffer Auto-tuning") - cubic = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack") - gso = flag.Int("gso", 0, "GSO maximum size") - swgso = flag.Bool("swgso", false, "software-level GSO") - clientTCPProbeFile = flag.String("client_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.") - serverTCPProbeFile = flag.String("server_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.") - cpuprofile = flag.String("cpuprofile", "", "write cpu profile to the specified file.") - memprofile = flag.String("memprofile", "", "write memory profile to the specified file.") -) - -type impl interface { - dial(address string) (net.Conn, error) - listen(port int) (net.Listener, error) - printStats() -} - -type netImpl struct{} - -func (netImpl) dial(address string) (net.Conn, error) { - return net.Dial("tcp", address) -} - -func (netImpl) listen(port int) (net.Listener, error) { - return net.Listen("tcp", fmt.Sprintf(":%d", port)) -} - -func (netImpl) printStats() { -} - -const ( - nicID = 1 // Fixed. - bufSize = 4 << 20 // 4MB. -) - -type netstackImpl struct { - s *stack.Stack - addr tcpip.Address - mode string -} - -func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) { - // Get all interfaces in the namespace. - ifaces, err := net.Interfaces() - if err != nil { - return nil, fmt.Errorf("querying interfaces: %v", err) - } - - for _, iface := range ifaces { - if iface.Name != ifaceName { - continue - } - // Create the socket. - const protocol = 0x0300 // htons(ETH_P_ALL) - fds := make([]int, numChannels) - for i := range fds { - fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) - if err != nil { - return nil, fmt.Errorf("unable to create raw socket: %v", err) - } - - // Bind to the appropriate device. - ll := syscall.SockaddrLinklayer{ - Protocol: protocol, - Ifindex: iface.Index, - Pkttype: syscall.PACKET_HOST, - } - if err := syscall.Bind(fd, &ll); err != nil { - return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) - } - - // RAW Sockets by default have a very small SO_RCVBUF of 256KB, - // up it to at least 4MB to reduce packet drops. - if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize); err != nil { - return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", bufSize, err) - } - - if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize); err != nil { - return nil, fmt.Errorf("setsockopt(..., SO_SNDBUF, %v,..) = %v", bufSize, err) - } - - if !*swgso && *gso != 0 { - if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { - return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) - } - } - fds[i] = fd - } - return fds, nil - } - return nil, fmt.Errorf("failed to find interface: %v", ifaceName) -} - -func newNetstackImpl(mode string) (impl, error) { - fds, err := setupNetwork(*iface, runtime.GOMAXPROCS(-1)) - if err != nil { - return nil, err - } - - // Parse details. - parsedAddr := tcpip.Address(net.ParseIP(*addr).To4()) - parsedDest := tcpip.Address("") // Filled in below. - parsedMask := tcpip.AddressMask("") // Filled in below. - switch *mask { - case 8: - parsedDest = tcpip.Address([]byte{parsedAddr[0], 0, 0, 0}) - parsedMask = tcpip.AddressMask([]byte{0xff, 0, 0, 0}) - case 16: - parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], 0, 0}) - parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0, 0}) - case 24: - parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], parsedAddr[2], 0}) - parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0xff, 0}) - default: - // This is just laziness; we don't expect a different mask. - return nil, fmt.Errorf("mask %d not supported", mask) - } - - // Create a new network stack. - netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol} - transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol} - s := stack.New(stack.Options{ - NetworkProtocols: netProtos, - TransportProtocols: transProtos, - }) - - // Generate a new mac for the eth device. - mac := make(net.HardwareAddr, 6) - rand.Read(mac) // Fill with random data. - mac[0] &^= 0x1 // Clear multicast bit. - mac[0] |= 0x2 // Set local assignment bit (IEEE802). - ep, err := fdbased.New(&fdbased.Options{ - FDs: fds, - MTU: uint32(*mtu), - EthernetHeader: true, - Address: tcpip.LinkAddress(mac), - // Enable checksum generation as we need to generate valid - // checksums for the veth device to deliver our packets to the - // peer. But we do want to disable checksum verification as veth - // devices do perform GRO and the linux host kernel may not - // regenerate valid checksums after GRO. - TXChecksumOffload: false, - RXChecksumOffload: true, - PacketDispatchMode: fdbased.RecvMMsg, - GSOMaxSize: uint32(*gso), - SoftwareGSOEnabled: *swgso, - }) - if err != nil { - return nil, fmt.Errorf("failed to create FD endpoint: %v", err) - } - if err := s.CreateNIC(nicID, fifo.New(ep, runtime.GOMAXPROCS(0), 1000)); err != nil { - return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err) - } - if err := s.AddAddress(nicID, ipv4.ProtocolNumber, parsedAddr); err != nil { - return nil, fmt.Errorf("error adding IP address to %q: %v", *iface, err) - } - - subnet, err := tcpip.NewSubnet(parsedDest, parsedMask) - if err != nil { - return nil, fmt.Errorf("tcpip.Subnet(%s, %s): %s", parsedDest, parsedMask, err) - } - // Add default route; we only support - s.SetRouteTable([]tcpip.Route{ - { - Destination: subnet, - NIC: nicID, - }, - }) - - // Set protocol options. - { - opt := tcpip.TCPSACKEnabled(*sack) - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) - } - } - - if *rack { - opt := tcpip.TCPRecovery(tcpip.TCPRACKLossDetection) - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - return nil, fmt.Errorf("enabling RACK failed: %v", err) - } - } - - // Enable Receive Buffer Auto-Tuning. - { - opt := tcpip.TCPModerateReceiveBufferOption(*moderateRecvBuf) - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) - } - } - - // Set Congestion Control to cubic if requested. - if *cubic { - opt := tcpip.CongestionControlOption("cubic") - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, opt, err) - } - } - - return netstackImpl{ - s: s, - addr: parsedAddr, - mode: mode, - }, nil -} - -func (n netstackImpl) dial(address string) (net.Conn, error) { - host, port, err := net.SplitHostPort(address) - if err != nil { - return nil, err - } - if host == "" { - // A host must be provided for the dial. - return nil, fmt.Errorf("no host provided") - } - portNumber, err := strconv.Atoi(port) - if err != nil { - return nil, err - } - addr := tcpip.FullAddress{ - NIC: nicID, - Addr: tcpip.Address(net.ParseIP(host).To4()), - Port: uint16(portNumber), - } - conn, err := gonet.DialTCP(n.s, addr, ipv4.ProtocolNumber) - if err != nil { - return nil, err - } - return conn, nil -} - -func (n netstackImpl) listen(port int) (net.Listener, error) { - addr := tcpip.FullAddress{ - NIC: nicID, - Port: uint16(port), - } - listener, err := gonet.ListenTCP(n.s, addr, ipv4.ProtocolNumber) - if err != nil { - return nil, err - } - return listener, nil -} - -var zeroFieldsRegexp = regexp.MustCompile(`\s*[a-zA-Z0-9]*:0`) - -func (n netstackImpl) printStats() { - // Don't show zero fields. - stats := zeroFieldsRegexp.ReplaceAllString(fmt.Sprintf("%+v", n.s.Stats()), "") - log.Printf("netstack %s Stats: %+v\n", n.mode, stats) -} - -// installProbe installs a TCP Probe function that will dump endpoint -// state to the specified file. It also returns a close func() that -// can be used to close the probeFile. -func (n netstackImpl) installProbe(probeFileName string) (close func()) { - // Install Probe to dump out end point state. - probeFile, err := os.Create(probeFileName) - if err != nil { - log.Fatalf("failed to create tcp_probe file %s: %v", probeFileName, err) - } - probeEncoder := gob.NewEncoder(probeFile) - // Install a TCP Probe. - n.s.AddTCPProbe(func(state stack.TCPEndpointState) { - probeEncoder.Encode(state) - }) - return func() { probeFile.Close() } -} - -func main() { - flag.Parse() - if *port == 0 { - log.Fatalf("no port provided") - } - if *forward == "" { - log.Fatalf("no forward provided") - } - // Seed the random number generator to ensure that we are given MAC addresses that don't - // for the case of the client and server stack. - rand.Seed(time.Now().UTC().UnixNano()) - - if *cpuprofile != "" { - f, err := os.Create(*cpuprofile) - if err != nil { - log.Fatal("could not create CPU profile: ", err) - } - defer func() { - if err := f.Close(); err != nil { - log.Print("error closing CPU profile: ", err) - } - }() - if err := pprof.StartCPUProfile(f); err != nil { - log.Fatal("could not start CPU profile: ", err) - } - defer pprof.StopCPUProfile() - } - - var ( - in impl - out impl - err error - ) - if *server { - in, err = newNetstackImpl("server") - if *serverTCPProbeFile != "" { - defer in.(netstackImpl).installProbe(*serverTCPProbeFile)() - } - - } else { - in = netImpl{} - } - if err != nil { - log.Fatalf("netstack error: %v", err) - } - if *client { - out, err = newNetstackImpl("client") - if *clientTCPProbeFile != "" { - defer out.(netstackImpl).installProbe(*clientTCPProbeFile)() - } - } else { - out = netImpl{} - } - if err != nil { - log.Fatalf("netstack error: %v", err) - } - - // Dial forward before binding. - var next net.Conn - for { - next, err = out.dial(*forward) - if err == nil { - break - } - time.Sleep(50 * time.Millisecond) - log.Printf("connect failed retrying: %v", err) - } - - // Bind once to the server socket. - listener, err := in.listen(*port) - if err != nil { - // Should not happen, everything must be bound by this time - // this proxy is started. - log.Fatalf("unable to listen: %v", err) - } - log.Printf("client=%v, server=%v, ready.", *client, *server) - - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGTERM) - go func() { - <-sigs - if *cpuprofile != "" { - pprof.StopCPUProfile() - } - if *memprofile != "" { - f, err := os.Create(*memprofile) - if err != nil { - log.Fatal("could not create memory profile: ", err) - } - defer func() { - if err := f.Close(); err != nil { - log.Print("error closing memory profile: ", err) - } - }() - runtime.GC() // get up-to-date statistics - if err := pprof.WriteHeapProfile(f); err != nil { - log.Fatalf("Unable to write heap profile: %v", err) - } - } - os.Exit(0) - }() - - for { - // Forward all connections. - inConn, err := listener.Accept() - if err != nil { - // This should not happen; we are listening - // successfully. Exhausted all available FDs? - log.Fatalf("accept error: %v", err) - } - log.Printf("incoming connection established.") - - // Copy both ways. - go io.Copy(inConn, next) - go io.Copy(next, inConn) - - // Print stats every second. - go func() { - t := time.NewTicker(time.Second) - defer t.Stop() - for { - <-t.C - in.printStats() - out.printStats() - } - }() - - for { - // Dial again. - next, err = out.dial(*forward) - if err == nil { - break - } - } - } -} |