Diffstat (limited to 'benchmarks/tcp')
-rw-r--r--  benchmarks/tcp/BUILD              41
-rw-r--r--  benchmarks/tcp/README.md          87
-rw-r--r--  benchmarks/tcp/nsjoin.c           47
-rwxr-xr-x  benchmarks/tcp/tcp_benchmark.sh  388
-rw-r--r--  benchmarks/tcp/tcp_proxy.go      444
5 files changed, 1007 insertions, 0 deletions
diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD
new file mode 100644
index 000000000..735d7127f
--- /dev/null
+++ b/benchmarks/tcp/BUILD
@@ -0,0 +1,41 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+ name = "tcp_proxy",
+ srcs = ["tcp_proxy.go"],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/adapters/gonet",
+ "//pkg/tcpip/link/fdbased",
+ "//pkg/tcpip/network/arp",
+ "//pkg/tcpip/network/ipv4",
+ "//pkg/tcpip/stack",
+ "//pkg/tcpip/transport/tcp",
+ "//pkg/tcpip/transport/udp",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+# nsjoin is a trivial replacement for nsenter. This is used because nsenter is
+# not available on all systems where this benchmark is run (and we aim to
+# minimize external dependencies).
+
+cc_binary(
+ name = "nsjoin",
+ srcs = ["nsjoin.c"],
+ visibility = ["//:sandbox"],
+)
+
+sh_binary(
+ name = "tcp_benchmark",
+ srcs = ["tcp_benchmark.sh"],
+ data = [
+ ":nsjoin",
+ ":tcp_proxy",
+ ],
+ visibility = ["//:sandbox"],
+)
diff --git a/benchmarks/tcp/README.md b/benchmarks/tcp/README.md
new file mode 100644
index 000000000..38e6e69f0
--- /dev/null
+++ b/benchmarks/tcp/README.md
@@ -0,0 +1,87 @@
+# TCP Benchmarks
+
+This directory contains a standardized TCP benchmark. This helps to evaluate the
+performance of netstack and native networking stacks under various conditions.
+
+## `tcp_benchmark`
+
+This benchmark allows TCP throughput testing under various conditions. The setup
+consists of an iperf client, a client proxy, a server proxy and an iperf server.
+The client proxy and server proxy abstract the network mechanism used to
+communicate between the iperf client and server.
+
+The setup looks like the following:
+
+```
+ +--------------+  (native)            +--------------+
+ | iperf client |[lo @ 10.0.0.1]------>| client proxy |
+ +--------------+                      +--------------+
+                                    [client.0 @ 10.0.0.2]
+                                  (netstack) | | (native)
+                                       +------+-----+
+                                              |
+                                            [br0]
+                                              |
+         Network emulation applied ---> [wan.0:wan.1]
+                                              |
+                                            [br1]
+                                              |
+                                       +------+-----+
+                                  (netstack) | | (native)
+                                    [server.0 @ 10.0.0.3]
+ +--------------+                      +--------------+
+ | iperf server |<------[lo @ 10.0.0.4]| server proxy |
+ +--------------+  (native)            +--------------+
+```
+
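+For reference, each proxy is essentially an accept/dial/copy loop; the only
+difference between the native and netstack configurations is which stack
+provides the listener and the dialer. The sketch below is a simplified,
+host-stack-only illustration of that loop (the addresses and file name are
+hypothetical); see `tcp_proxy.go` in this change for the real implementation,
+which adds the netstack backend, GSO options, and statistics reporting.
+
+```
+// proxy_sketch.go: a minimal TCP forwarding loop (illustration only).
+package main
+
+import (
+	"io"
+	"log"
+	"net"
+)
+
+func main() {
+	const listenAddr = ":44000"          // proxy port (hypothetical)
+	const forwardAddr = "10.0.0.3:44000" // peer proxy or iperf server (hypothetical)
+
+	ln, err := net.Listen("tcp", listenAddr)
+	if err != nil {
+		log.Fatalf("listen: %v", err)
+	}
+	for {
+		in, err := ln.Accept()
+		if err != nil {
+			log.Fatalf("accept: %v", err)
+		}
+		out, err := net.Dial("tcp", forwardAddr)
+		if err != nil {
+			log.Fatalf("dial: %v", err)
+		}
+		// Copy bytes in both directions until either side closes.
+		go io.Copy(out, in)
+		go io.Copy(in, out)
+	}
+}
+```
+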
+Different configurations can be run using different arguments. For example:
+
+* Native test under normal internet conditions: `tcp_benchmark`
+* Native test under ideal conditions: `tcp_benchmark --ideal`
+* Netstack client under ideal conditions: `tcp_benchmark --client --ideal`
+* Netstack client with 5% packet loss: `tcp_benchmark --client --ideal --loss
+ 5`
+
+Use `tcp_benchmark --help` for full arguments.
+
+This tool may be used to easily generate data for graphing. For example, to
+generate a CSV for various latencies, you might do:
+
+```
+rm -f /tmp/netstack_latency.csv /tmp/native_latency.csv
+latencies=$(seq 0 5 50;
+ seq 60 10 100;
+ seq 125 25 250;
+ seq 300 50 500)
+for latency in $latencies; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --client --ideal --latency $latency)
+ echo $latency,$throughput,$client_cpu >> /tmp/netstack_latency.csv
+done
+for latency in $latencies; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --ideal --latency $latency)
+ echo $latency,$throughput,$client_cpu >> /tmp/native_latency.csv
+done
+```
+
+Similarly, to generate a CSV for various levels of packet loss, the following
+would be appropriate:
+
+```
+rm -f /tmp/netstack_loss.csv /tmp/native_loss.csv
+losses=$(seq 0 0.1 1.0;
+ seq 1.2 0.2 2.0;
+ seq 2.5 0.5 5.0;
+ seq 6.0 1.0 10.0)
+for loss in $losses; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --client --ideal --latency 10 --loss $loss)
+ echo $loss,$throughput,$client_cpu >> /tmp/netstack_loss.csv
+done
+for loss in $losses; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --ideal --latency 10 --loss $loss)
+ echo $loss,$throughput,$client_cpu >> /tmp/native_loss.csv
+done
+```
diff --git a/benchmarks/tcp/nsjoin.c b/benchmarks/tcp/nsjoin.c
new file mode 100644
index 000000000..524b4d549
--- /dev/null
+++ b/benchmarks/tcp/nsjoin.c
@@ -0,0 +1,47 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+int main(int argc, char** argv) {
+ if (argc <= 2) {
+ fprintf(stderr, "error: must provide a namespace file and a command.\n");
+ fprintf(stderr, "usage: %s <file> <command> [arguments...]\n", argv[0]);
+ return 1;
+ }
+
+ int fd = open(argv[1], O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "error opening %s: %s\n", argv[1], strerror(errno));
+ return 1;
+ }
+ if (setns(fd, 0) < 0) {
+ fprintf(stderr, "error joining %s: %s\n", argv[1], strerror(errno));
+ return 1;
+ }
+
+ execvp(argv[2], &argv[2]);
+ return 1;
+}
diff --git a/benchmarks/tcp/tcp_benchmark.sh b/benchmarks/tcp/tcp_benchmark.sh
new file mode 100755
index 000000000..e65801a7b
--- /dev/null
+++ b/benchmarks/tcp/tcp_benchmark.sh
@@ -0,0 +1,388 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TCP benchmark; see README.md for documentation.
+
+# Fixed parameters.
+iperf_port=45201 # Not likely to be privileged.
+proxy_port=44000 # Ditto.
+client_addr=10.0.0.1
+client_proxy_addr=10.0.0.2
+server_proxy_addr=10.0.0.3
+server_addr=10.0.0.4
+mask=8
+
+# Defaults; this provides a reasonable approximation of a decent internet link.
+# Parameters can be varied independently from this set to see response to
+# various changes in the kind of link available.
+client=false
+server=false
+verbose=false
+gso=0
+swgso=false
+mtu=1280 # 1280 is a reasonable lowest-common-denominator.
+latency=10 # 10ms approximates a fast, dedicated connection.
+latency_variation=1 # +/- 1ms is a relatively low amount of jitter.
+loss=0.1 # 0.1% loss is non-zero, but not extremely high.
+duplicate=0.1 # 0.1% means duplicates are 1/10x as frequent as losses.
+duration=30 # 30s is enough time to get consistent results (experimentally).
+helper_dir=$(dirname $0)
+netstack_opts=
+disable_linux_gso=
+num_client_threads=1
+
+# Check for netem support.
+lsmod_output=$(lsmod | grep sch_netem)
+if [ "$?" != "0" ]; then
+ echo "warning: sch_netem may not be installed." >&2
+fi
+
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --client)
+ client=true
+ ;;
+ --client_tcp_probe_file)
+ shift
+ netstack_opts="${netstack_opts} -client_tcp_probe_file=$1"
+ ;;
+ --server)
+ server=true
+ ;;
+ --verbose)
+ verbose=true
+ ;;
+ --gso)
+ shift
+ gso=$1
+ ;;
+ --swgso)
+ swgso=true
+ ;;
+ --server_tcp_probe_file)
+ shift
+ netstack_opts="${netstack_opts} -server_tcp_probe_file=$1"
+ ;;
+ --ideal)
+ mtu=1500 # Standard ethernet.
+ latency=0 # No latency.
+ latency_variation=0 # No jitter.
+ loss=0 # No loss.
+ duplicate=0 # No duplicates.
+ ;;
+ --mtu)
+ shift
+ [ "$#" -le 0 ] && echo "no mtu provided" && exit 1
+ mtu=$1
+ ;;
+ --sack)
+ netstack_opts="${netstack_opts} -sack"
+ ;;
+ --cubic)
+ netstack_opts="${netstack_opts} -cubic"
+ ;;
+ --duration)
+ shift
+ [ "$#" -le 0 ] && echo "no duration provided" && exit 1
+ duration=$1
+ ;;
+ --latency)
+ shift
+ [ "$#" -le 0 ] && echo "no latency provided" && exit 1
+ latency=$1
+ ;;
+ --latency-variation)
+ shift
+ [ "$#" -le 0 ] && echo "no latency variation provided" && exit 1
+ latency_variation=$1
+ ;;
+ --loss)
+ shift
+ [ "$#" -le 0 ] && echo "no loss probability provided" && exit 1
+ loss=$1
+ ;;
+ --duplicate)
+ shift
+ [ "$#" -le 0 ] && echo "no duplicate provided" && exit 1
+ duplicate=$1
+ ;;
+ --cpuprofile)
+ shift
+ netstack_opts="${netstack_opts} -cpuprofile=$1"
+ ;;
+ --memprofile)
+ shift
+ netstack_opts="${netstack_opts} -memprofile=$1"
+ ;;
+ --disable-linux-gso)
+ disable_linux_gso=1
+ ;;
+ --num-client-threads)
+ shift
+ num_client_threads=$1
+ ;;
+ --helpers)
+ shift
+ [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1
+ helper_dir=$1
+ ;;
+ *)
+ echo "usage: $0 [options]"
+ echo "options:"
+ echo " --help show this message"
+ echo " --verbose verbose output"
+ echo " --client use netstack as the client"
+ echo " --ideal reset all network emulation"
+ echo " --server use netstack as the server"
+ echo " --mtu set the mtu (bytes)"
+ echo " --sack enable SACK support"
+ echo " --cubic enable CUBIC congestion control for Netstack"
+ echo " --duration set the test duration (s)"
+ echo " --latency set the latency (ms)"
+ echo " --latency-variation set the latency variation"
+ echo " --loss set the loss probability (%)"
+ echo " --duplicate set the duplicate probability (%)"
+ echo " --helpers set the helper directory"
+ echo " --num-client-threads number of parallel client threads to run"
+ echo " --disable-linux-gso disable segmentation offload in the Linux network stack"
+ echo ""
+ echo "The output will of the script will be:"
+ echo " <throughput> <client-cpu-usage> <server-cpu-usage>"
+ exit 1
+ esac
+ shift
+done
+
+if [ ${verbose} == "true" ]; then
+ set -x
+fi
+
+# Latency needs to be halved, since it's applied on both ways.
+half_latency=$(echo ${latency}/2 | bc -l | awk '{printf "%1.2f", $0}')
+half_loss=$(echo ${loss}/2 | bc -l | awk '{printf "%1.6f", $0}')
+half_duplicate=$(echo ${duplicate}/2 | bc -l | awk '{printf "%1.6f", $0}')
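+# For example, with the default latency of 10 (ms), each direction of the
+# emulated WAN is delayed by 5.00ms, giving a ~10ms round trip.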
+helper_dir=${helper_dir#$(pwd)/} # Use relative paths.
+proxy_binary=${helper_dir}/tcp_proxy
+nsjoin_binary=${helper_dir}/nsjoin
+
+if [ ! -e ${proxy_binary} ]; then
+ echo "Could not locate ${proxy_binary}, please make sure you've built the binary"
+ exit 1
+fi
+
+if [ ! -e ${nsjoin_binary} ]; then
+ echo "Could not locate ${nsjoin_binary}, please make sure you've built the binary"
+ exit 1
+fi
+
+if [ $(echo ${latency_variation} | awk '{printf "%1.2f", $0}') != "0.00" ]; then
+ # As long as there's some jitter, then we use the paretonormal distribution.
+ # This will preserve the minimum RTT, but add a realistic amount of jitter to
+ # the connection and cause re-ordering, etc. The regular pareto distribution
+ # appears to add an unreasonable level of delay (we want only small spikes).
+ distribution="distribution paretonormal"
+else
+ distribution=""
+fi
+
+# Client proxy that will listen on the client's iperf target and forward
+# traffic using the host networking stack.
+client_args="${proxy_binary} -port ${proxy_port} -forward ${server_proxy_addr}:${proxy_port}"
+if ${client}; then
+ # Client proxy that will listen on the client's iperf target
+ # and forward traffic using netstack.
+ client_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -client \\
+ -mtu ${mtu} -iface client.0 -addr ${client_proxy_addr} -mask ${mask} \\
+ -forward ${server_proxy_addr}:${proxy_port} -gso=${gso} -swgso=${swgso}"
+fi
+
+# Server proxy that will listen on the proxy port and forward to the server's
+# iperf server using the host networking stack.
+server_args="${proxy_binary} -port ${proxy_port} -forward ${server_addr}:${iperf_port}"
+if ${server}; then
+ # Server proxy that will listen on the proxy port and forward to the server's
+ # iperf server using netstack.
+ server_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -server \\
+ -mtu ${mtu} -iface server.0 -addr ${server_proxy_addr} -mask ${mask} \\
+ -forward ${server_addr}:${iperf_port} -gso=${gso} -swgso=${swgso}"
+fi
+
+# Specify loss and duplicate parameters only if they are non-zero
+loss_opt=""
+if [ "$(echo $half_loss | bc -q)" != "0" ]; then
+ loss_opt="loss random ${half_loss}%"
+fi
+duplicate_opt=""
+if [ "$(echo $half_duplicate | bc -q)" != "0" ]; then
+ duplicate_opt="duplicate ${half_duplicate}%"
+fi
+
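+# Build the test topology inside fresh user, mount, network and PID namespaces
+# (-U -m -n -p), mapping the current user to root (-r) and forking so the
+# shell runs inside the new PID namespace (-f) with a private /proc
+# (--mount-proc). This keeps the interfaces, bridges and qdiscs created below
+# from touching the host's network configuration.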
+exec unshare -U -m -n -r -f -p --mount-proc /bin/bash << EOF
+set -e -m
+
+if [ ${verbose} == "true" ]; then
+ set -x
+fi
+
+mount -t tmpfs netstack-bench /tmp
+
+# PATH may have been reset inside the unshare if the shell loaded system-wide
+# profiles. Ensure that tools are discoverable via the parent's PATH.
+export PATH=${PATH}
+
+# Add client, server interfaces.
+ip link add client.0 type veth peer name client.1
+ip link add server.0 type veth peer name server.1
+
+# Add network emulation devices.
+ip link add wan.0 type veth peer name wan.1
+ip link set wan.0 up
+ip link set wan.1 up
+
+# Enroll on the bridge.
+ip link add name br0 type bridge
+ip link add name br1 type bridge
+ip link set client.1 master br0
+ip link set server.1 master br1
+ip link set wan.0 master br0
+ip link set wan.1 master br1
+ip link set br0 up
+ip link set br1 up
+
+# Set the MTU appropriately.
+ip link set client.0 mtu ${mtu}
+ip link set server.0 mtu ${mtu}
+ip link set wan.0 mtu ${mtu}
+ip link set wan.1 mtu ${mtu}
+
+# Add appropriate latency, loss and duplication.
+#
+# This is added in at the point of bridge connection.
+for device in wan.0 wan.1; do
+ # NOTE: We don't support a loss correlation as testing has shown that it
+ # actually doesn't work. The man page actually has a small comment about this
+ # "It is also possible to add a correlation, but this option is now deprecated
+ # due to the noticed bad behavior." For more information see netem(8).
+ tc qdisc add dev \$device root netem \\
+ delay ${half_latency}ms ${latency_variation}ms ${distribution} \\
+ ${loss_opt} ${duplicate_opt}
+done
+
+# Start a client proxy.
+touch /tmp/client.netns
+unshare -n mount --bind /proc/self/ns/net /tmp/client.netns
+
+# Move the endpoint into the namespace.
+while ip link | grep client.0 > /dev/null; do
+ ip link set dev client.0 netns /tmp/client.netns
+done
+
+if ! ${client}; then
+ # Only add the address to the NIC if netstack is not in use. Otherwise the
+ # host will also process the inbound SYN and send a RST back.
+ ${nsjoin_binary} /tmp/client.netns ip addr add ${client_proxy_addr}/${mask} dev client.0
+fi
+
+# Start a server proxy.
+touch /tmp/server.netns
+unshare -n mount --bind /proc/self/ns/net /tmp/server.netns
+# Move the endpoint into the namespace.
+while ip link | grep server.0 > /dev/null; do
+ ip link set dev server.0 netns /tmp/server.netns
+done
+if ! ${server}; then
+ # Only add the address to the NIC if netstack is not in use. Otherwise the
+ # host will also process the inbound SYN and send a RST back.
+ ${nsjoin_binary} /tmp/server.netns ip addr add ${server_proxy_addr}/${mask} dev server.0
+fi
+
+# Add client and server addresses, and bring everything up.
+${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0
+${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0
+if [ "${disable_linux_gso}" == "1" ]; then
+ ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 tso off
+ ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gro off
+ ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gso off
+ ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 tso off
+ ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gso off
+ ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gro off
+fi
+${nsjoin_binary} /tmp/client.netns ip link set client.0 up
+${nsjoin_binary} /tmp/client.netns ip link set lo up
+${nsjoin_binary} /tmp/server.netns ip link set server.0 up
+${nsjoin_binary} /tmp/server.netns ip link set lo up
+ip link set dev client.1 up
+ip link set dev server.1 up
+
+${nsjoin_binary} /tmp/client.netns ${client_args} &
+client_pid=\$!
+${nsjoin_binary} /tmp/server.netns ${server_args} &
+server_pid=\$!
+
+# Start the iperf server.
+${nsjoin_binary} /tmp/server.netns iperf -p ${iperf_port} -s >&2 &
+iperf_pid=\$!
+
+# Show traffic information.
+if ! ${client} && ! ${server}; then
+ ${nsjoin_binary} /tmp/client.netns ping -c 100 -i 0.001 -W 1 ${server_addr} >&2 || true
+fi
+
+results_file=\$(mktemp)
+function cleanup {
+ rm -f \$results_file
+ kill -TERM \$client_pid
+ kill -TERM \$server_pid
+ wait \$client_pid
+ wait \$server_pid
+ kill -9 \$iperf_pid 2>/dev/null
+}
+
+# Allow failure from this point.
+set +e
+trap cleanup EXIT
+
+# Run the benchmark, recording the results file.
+while ${nsjoin_binary} /tmp/client.netns iperf \\
+ -p ${proxy_port} -c ${client_addr} -t ${duration} -f m -P ${num_client_threads} 2>&1 \\
+ | tee \$results_file \\
+ | grep "connect failed" >/dev/null; do
+ sleep 0.1 # Wait for all services.
+done
+
+# Unlink all relevant devices from the bridge. This is because when the bridge
+# is deleted, the kernel may hang. It appears that this problem is fixed in
+# upstream commit 1ce5cce895309862d2c35d922816adebe094fe4a.
+ip link set client.1 nomaster
+ip link set server.1 nomaster
+ip link set wan.0 nomaster
+ip link set wan.1 nomaster
+
+# Emit raw results.
+cat \$results_file >&2
+
+# Emit a useful result (final throughput).
+mbits=\$(grep Mbits/sec \$results_file \\
+ | sed -n -e 's/^.*[[:space:]]\\([[:digit:]]\\+\\(\\.[[:digit:]]\\+\\)\\?\\)[[:space:]]*Mbits\\/sec.*/\\1/p')
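+# Fields 14 and 15 of /proc/<pid>/stat are utime and stime in clock ticks;
+# dividing their sum by CLK_TCK and by the test duration gives the average
+# number of CPU cores each proxy consumed over the run.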
+client_cpu_ticks=\$(cat /proc/\$client_pid/stat \\
+ | awk '{print (\$14+\$15);}')
+server_cpu_ticks=\$(cat /proc/\$server_pid/stat \\
+ | awk '{print (\$14+\$15);}')
+ticks_per_sec=\$(getconf CLK_TCK)
+client_cpu_load=\$(bc -l <<< \$client_cpu_ticks/\$ticks_per_sec/${duration})
+server_cpu_load=\$(bc -l <<< \$server_cpu_ticks/\$ticks_per_sec/${duration})
+echo \$mbits \$client_cpu_load \$server_cpu_load
+EOF
diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
new file mode 100644
index 000000000..72ada5700
--- /dev/null
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -0,0 +1,444 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary tcp_proxy is a simple TCP proxy.
+package main
+
+import (
+ "encoding/gob"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "math/rand"
+ "net"
+ "os"
+ "os/signal"
+ "regexp"
+ "runtime"
+ "runtime/pprof"
+ "strconv"
+ "syscall"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
+ "gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+)
+
+var (
+ port = flag.Int("port", 0, "bind port (all addresses)")
+ forward = flag.String("forward", "", "forwarding target")
+ client = flag.Bool("client", false, "use netstack for listen")
+ server = flag.Bool("server", false, "use netstack for dial")
+
+ // Netstack-specific options.
+ mtu = flag.Int("mtu", 1280, "mtu for network stack")
+ addr = flag.String("addr", "", "address for tap-based netstack")
+ mask = flag.Int("mask", 8, "mask size for address")
+ iface = flag.String("iface", "", "network interface name to bind for netstack")
+ sack = flag.Bool("sack", false, "enable SACK support for netstack")
+ cubic = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack")
+ gso = flag.Int("gso", 0, "GSO maximum size")
+ swgso = flag.Bool("swgso", false, "software-level GSO")
+ clientTCPProbeFile = flag.String("client_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
+ serverTCPProbeFile = flag.String("server_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
+ cpuprofile = flag.String("cpuprofile", "", "write cpu profile to the specified file.")
+ memprofile = flag.String("memprofile", "", "write memory profile to the specified file.")
+)
+
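+// impl abstracts how the proxy reaches the network: either the host's
+// networking stack (netImpl) or gVisor's netstack (netstackImpl).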
+type impl interface {
+ dial(address string) (net.Conn, error)
+ listen(port int) (net.Listener, error)
+ printStats()
+}
+
+type netImpl struct{}
+
+func (netImpl) dial(address string) (net.Conn, error) {
+ return net.Dial("tcp", address)
+}
+
+func (netImpl) listen(port int) (net.Listener, error) {
+ return net.Listen("tcp", fmt.Sprintf(":%d", port))
+}
+
+func (netImpl) printStats() {
+}
+
+const (
+ nicID = 1 // Fixed.
+ bufSize = 4 << 20 // 4MB.
+)
+
+type netstackImpl struct {
+ s *stack.Stack
+ addr tcpip.Address
+ mode string
+}
+
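+// setupNetwork opens one AF_PACKET socket per channel, bound to ifaceName,
+// sizes its socket buffers, and returns the file descriptors for use by the
+// fdbased link endpoint.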
+func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
+ // Get all interfaces in the namespace.
+ ifaces, err := net.Interfaces()
+ if err != nil {
+ return nil, fmt.Errorf("querying interfaces: %v", err)
+ }
+
+ for _, iface := range ifaces {
+ if iface.Name != ifaceName {
+ continue
+ }
+ // Create the socket.
+ const protocol = 0x0300 // htons(ETH_P_ALL)
+ fds := make([]int, numChannels)
+ for i := range fds {
+ fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+ if err != nil {
+ return nil, fmt.Errorf("unable to create raw socket: %v", err)
+ }
+
+ // Bind to the appropriate device.
+ ll := syscall.SockaddrLinklayer{
+ Protocol: protocol,
+ Ifindex: iface.Index,
+ Pkttype: syscall.PACKET_HOST,
+ }
+ if err := syscall.Bind(fd, &ll); err != nil {
+ return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+ }
+
+ // RAW Sockets by default have a very small SO_RCVBUF of 256KB,
+ // up it to at least 4MB to reduce packet drops.
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize); err != nil {
+ return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", bufSize, err)
+ }
+
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize); err != nil {
+ return nil, fmt.Errorf("setsockopt(..., SO_SNDBUF, %v,..) = %v", bufSize, err)
+ }
+
+ if !*swgso && *gso != 0 {
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+ return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+ }
+ }
+ fds[i] = fd
+ }
+ return fds, nil
+ }
+ return nil, fmt.Errorf("failed to find interface: %v", ifaceName)
+}
+
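+// newNetstackImpl builds a netstack-backed impl: it binds raw sockets to the
+// configured interface, creates a tcpip stack with an fdbased link endpoint,
+// assigns the IPv4 address, and installs a route for the local subnet.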
+func newNetstackImpl(mode string) (impl, error) {
+ fds, err := setupNetwork(*iface, runtime.GOMAXPROCS(-1))
+ if err != nil {
+ return nil, err
+ }
+
+ // Parse details.
+ parsedAddr := tcpip.Address(net.ParseIP(*addr).To4())
+ parsedDest := tcpip.Address("") // Filled in below.
+ parsedMask := tcpip.AddressMask("") // Filled in below.
+ switch *mask {
+ case 8:
+ parsedDest = tcpip.Address([]byte{parsedAddr[0], 0, 0, 0})
+ parsedMask = tcpip.AddressMask([]byte{0xff, 0, 0, 0})
+ case 16:
+ parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], 0, 0})
+ parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0, 0})
+ case 24:
+ parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], parsedAddr[2], 0})
+ parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0xff, 0})
+ default:
+ // This is just laziness; we don't expect a different mask.
+ return nil, fmt.Errorf("mask %d not supported", mask)
+ }
+
+ // Create a new network stack.
+ netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()}
+ transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()}
+ s := stack.New(stack.Options{
+ NetworkProtocols: netProtos,
+ TransportProtocols: transProtos,
+ })
+
+ // Generate a new mac for the eth device.
+ mac := make(net.HardwareAddr, 6)
+ rand.Read(mac) // Fill with random data.
+ mac[0] &^= 0x1 // Clear multicast bit.
+ mac[0] |= 0x2 // Set local assignment bit (IEEE802).
+ ep, err := fdbased.New(&fdbased.Options{
+ FDs: fds,
+ MTU: uint32(*mtu),
+ EthernetHeader: true,
+ Address: tcpip.LinkAddress(mac),
+ // Enable checksum generation as we need to generate valid
+ // checksums for the veth device to deliver our packets to the
+ // peer. But we do want to disable checksum verification as veth
+ // devices do perform GRO and the linux host kernel may not
+ // regenerate valid checksums after GRO.
+ TXChecksumOffload: false,
+ RXChecksumOffload: true,
+ PacketDispatchMode: fdbased.RecvMMsg,
+ GSOMaxSize: uint32(*gso),
+ SoftwareGSOEnabled: *swgso,
+ })
+ if err != nil {
+ return nil, fmt.Errorf("failed to create FD endpoint: %v", err)
+ }
+ if err := s.CreateNIC(nicID, ep); err != nil {
+ return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err)
+ }
+ if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+ return nil, fmt.Errorf("error adding ARP address to %q: %v", *iface, err)
+ }
+ if err := s.AddAddress(nicID, ipv4.ProtocolNumber, parsedAddr); err != nil {
+ return nil, fmt.Errorf("error adding IP address to %q: %v", *iface, err)
+ }
+
+ subnet, err := tcpip.NewSubnet(parsedDest, parsedMask)
+ if err != nil {
+ return nil, fmt.Errorf("tcpip.Subnet(%s, %s): %s", parsedDest, parsedMask, err)
+ }
+ // Add a route for the local subnet; this is the only route we support.
+ s.SetRouteTable([]tcpip.Route{
+ {
+ Destination: subnet,
+ NIC: nicID,
+ },
+ })
+
+ // Set protocol options.
+ if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(*sack)); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %v", err)
+ }
+
+ // Set Congestion Control to cubic if requested.
+ if *cubic {
+ if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption for CongestionControlOption(cubic) failed: %v", err)
+ }
+ }
+
+ return netstackImpl{
+ s: s,
+ addr: parsedAddr,
+ mode: mode,
+ }, nil
+}
+
+func (n netstackImpl) dial(address string) (net.Conn, error) {
+ host, port, err := net.SplitHostPort(address)
+ if err != nil {
+ return nil, err
+ }
+ if host == "" {
+ // A host must be provided for the dial.
+ return nil, fmt.Errorf("no host provided")
+ }
+ portNumber, err := strconv.Atoi(port)
+ if err != nil {
+ return nil, err
+ }
+ addr := tcpip.FullAddress{
+ NIC: nicID,
+ Addr: tcpip.Address(net.ParseIP(host).To4()),
+ Port: uint16(portNumber),
+ }
+ conn, err := gonet.DialTCP(n.s, addr, ipv4.ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return conn, nil
+}
+
+func (n netstackImpl) listen(port int) (net.Listener, error) {
+ addr := tcpip.FullAddress{
+ NIC: nicID,
+ Port: uint16(port),
+ }
+ listener, err := gonet.NewListener(n.s, addr, ipv4.ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return listener, nil
+}
+
+var zeroFieldsRegexp = regexp.MustCompile(`\s*[a-zA-Z0-9]*:0`)
+
+func (n netstackImpl) printStats() {
+ // Don't show zero fields.
+ stats := zeroFieldsRegexp.ReplaceAllString(fmt.Sprintf("%+v", n.s.Stats()), "")
+ log.Printf("netstack %s Stats: %+v\n", n.mode, stats)
+}
+
+// installProbe installs a TCP Probe function that will dump endpoint
+// state to the specified file. It also returns a close func() that
+// can be used to close the probeFile.
+func (n netstackImpl) installProbe(probeFileName string) (close func()) {
+ // Install Probe to dump out end point state.
+ probeFile, err := os.Create(probeFileName)
+ if err != nil {
+ log.Fatalf("failed to create tcp_probe file %s: %v", probeFileName, err)
+ }
+ probeEncoder := gob.NewEncoder(probeFile)
+ // Install a TCP Probe.
+ n.s.AddTCPProbe(func(state stack.TCPEndpointState) {
+ probeEncoder.Encode(state)
+ })
+ return func() { probeFile.Close() }
+}
+
+func main() {
+ flag.Parse()
+ if *port == 0 {
+ log.Fatalf("no port provided")
+ }
+ if *forward == "" {
+ log.Fatalf("no forward provided")
+ }
+ // Seed the random number generator to ensure that the client and server
+ // stacks are given MAC addresses that don't collide.
+ rand.Seed(time.Now().UTC().UnixNano())
+
+ if *cpuprofile != "" {
+ f, err := os.Create(*cpuprofile)
+ if err != nil {
+ log.Fatal("could not create CPU profile: ", err)
+ }
+ defer func() {
+ if err := f.Close(); err != nil {
+ log.Print("error closing CPU profile: ", err)
+ }
+ }()
+ if err := pprof.StartCPUProfile(f); err != nil {
+ log.Fatal("could not start CPU profile: ", err)
+ }
+ defer pprof.StopCPUProfile()
+ }
+
+ var (
+ in impl
+ out impl
+ err error
+ )
+ if *server {
+ in, err = newNetstackImpl("server")
+ if *serverTCPProbeFile != "" {
+ defer in.(netstackImpl).installProbe(*serverTCPProbeFile)()
+ }
+
+ } else {
+ in = netImpl{}
+ }
+ if err != nil {
+ log.Fatalf("netstack error: %v", err)
+ }
+ if *client {
+ out, err = newNetstackImpl("client")
+ if *clientTCPProbeFile != "" {
+ defer out.(netstackImpl).installProbe(*clientTCPProbeFile)()
+ }
+ } else {
+ out = netImpl{}
+ }
+ if err != nil {
+ log.Fatalf("netstack error: %v", err)
+ }
+
+ // Dial forward before binding.
+ var next net.Conn
+ for {
+ next, err = out.dial(*forward)
+ if err == nil {
+ break
+ }
+ time.Sleep(50 * time.Millisecond)
+ log.Printf("connect failed retrying: %v", err)
+ }
+
+ // Bind once to the server socket.
+ listener, err := in.listen(*port)
+ if err != nil {
+ // Should not happen; everything must be bound by the time
+ // this proxy is started.
+ log.Fatalf("unable to listen: %v", err)
+ }
+ log.Printf("client=%v, server=%v, ready.", *client, *server)
+
+ sigs := make(chan os.Signal, 1)
+ signal.Notify(sigs, syscall.SIGTERM)
+ go func() {
+ <-sigs
+ if *cpuprofile != "" {
+ pprof.StopCPUProfile()
+ }
+ if *memprofile != "" {
+ f, err := os.Create(*memprofile)
+ if err != nil {
+ log.Fatal("could not create memory profile: ", err)
+ }
+ defer func() {
+ if err := f.Close(); err != nil {
+ log.Print("error closing memory profile: ", err)
+ }
+ }()
+ runtime.GC() // get up-to-date statistics
+ if err := pprof.WriteHeapProfile(f); err != nil {
+ log.Fatalf("Unable to write heap profile: %v", err)
+ }
+ }
+ os.Exit(0)
+ }()
+
+ for {
+ // Forward all connections.
+ inConn, err := listener.Accept()
+ if err != nil {
+ // This should not happen; we are listening
+ // successfully. Exhausted all available FDs?
+ log.Fatalf("accept error: %v", err)
+ }
+ log.Printf("incoming connection established.")
+
+ // Copy both ways.
+ go io.Copy(inConn, next)
+ go io.Copy(next, inConn)
+
+ // Print stats every second.
+ go func() {
+ t := time.NewTicker(time.Second)
+ defer t.Stop()
+ for {
+ <-t.C
+ in.printStats()
+ out.printStats()
+ }
+ }()
+
+ for {
+ // Dial again.
+ next, err = out.dial(*forward)
+ if err == nil {
+ break
+ }
+ }
+ }
+}