6 files changed, 476 insertions, 9 deletions
diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go
index 8f1572bf4..1b11f4b2d 100644
--- a/pkg/sentry/socket/epsocket/stack.go
+++ b/pkg/sentry/socket/epsocket/stack.go
@@ -198,8 +198,8 @@ func (s *Stack) IPTables() (iptables.IPTables, error) {
 
 // FillDefaultIPTables sets the stack's iptables to the default tables, which
 // allow and do not modify all traffic.
-func (s *Stack) FillDefaultIPTables() error {
-	return netfilter.FillDefaultIPTables(s.Stack)
+func (s *Stack) FillDefaultIPTables() {
+	netfilter.FillDefaultIPTables(s.Stack)
 }
 
 // Resume implements inet.Stack.Resume.
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 3021f83e7..354a0d6ee 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -13,6 +13,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/sentry/kernel",
         "//pkg/sentry/usermem",
         "//pkg/syserr",
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index efdb42903..9f87c32f1 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,7 +17,10 @@
 package netfilter
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserr"
@@ -26,21 +29,258 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
+// errorTargetName is used to mark targets as error targets. Error targets
+// shouldn't be reached - an error has occurred if we fall through to one.
+const errorTargetName = "ERROR"
+
+// metadata is opaque to netstack. It holds data that we need to translate
+// between Linux's and netstack's iptables representations.
+type metadata struct {
+	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
+	Underflow  [linux.NF_INET_NUMHOOKS]uint32
+	NumEntries uint32
+	Size       uint32
+}
+
 // GetInfo returns information about iptables.
 func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
-	// TODO(b/129292233): Implement.
-	return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
+	// Read in the struct and table name.
+	var info linux.IPTGetinfo
+	if _, err := t.CopyIn(outPtr, &info); err != nil {
+		return linux.IPTGetinfo{}, syserr.FromError(err)
+	}
+
+	// Find the appropriate table.
+	table, err := findTable(ep, info.TableName())
+	if err != nil {
+		return linux.IPTGetinfo{}, err
+	}
+
+	// Get the hooks that apply to this table.
+	info.ValidHooks = table.ValidHooks()
+
+	// Grab the metadata struct, which is used to store information (e.g.
+	// the number of entries) that applies to the user's encoding of
+	// iptables, but not netstack's.
+	metadata := table.Metadata().(metadata)
+
+	// Set values from metadata.
+	info.HookEntry = metadata.HookEntry
+	info.Underflow = metadata.Underflow
+	info.NumEntries = metadata.NumEntries
+	info.Size = metadata.Size
+
+	return info, nil
 }
 
 // GetEntries returns netstack's iptables rules encoded for the iptables tool.
 func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
-	// TODO(b/129292233): Implement.
-	return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
+	// Read in the struct and table name.
+	var userEntries linux.IPTGetEntries
+	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
+	}
+
+	// Find the appropriate table.
+	table, err := findTable(ep, userEntries.TableName())
+	if err != nil {
+		return linux.KernelIPTGetEntries{}, err
+	}
+
+	// Convert netstack's iptables rules to something that the iptables
+	// tool can understand.
+	entries, _, err := convertNetstackToBinary(userEntries.TableName(), table)
+	if err != nil {
+		return linux.KernelIPTGetEntries{}, err
+	}
+	if binary.Size(entries) > uintptr(outLen) {
+		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
+	}
+
+	return entries, nil
+}
+
+func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) {
+	ipt, err := ep.IPTables()
+	if err != nil {
+		return iptables.Table{}, syserr.FromError(err)
+	}
+	table, ok := ipt.Tables[tableName]
+	if !ok {
+		return iptables.Table{}, syserr.ErrInvalidArgument
+	}
+	return table, nil
 }
 
 // FillDefaultIPTables sets stack's IPTables to the default tables and
 // populates them with metadata.
-func FillDefaultIPTables(stack *stack.Stack) error {
-	stack.SetIPTables(iptables.DefaultTables())
-	return nil
+func FillDefaultIPTables(stack *stack.Stack) {
+	ipt := iptables.DefaultTables()
+
+	// In order to fill in the metadata, we have to translate ipt from its
+	// netstack format to Linux's giant-binary-blob format.
+	for name, table := range ipt.Tables {
+		_, metadata, err := convertNetstackToBinary(name, table)
+		if err != nil {
+			panic(fmt.Errorf("Unable to set default IP tables: %v", err))
+		}
+		table.SetMetadata(metadata)
+		ipt.Tables[name] = table
+	}
+
+	stack.SetIPTables(ipt)
+}
+
+// convertNetstackToBinary converts the iptables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a bit, reading some offsets,
+// jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+	// Return values.
+	var entries linux.KernelIPTGetEntries
+	var meta metadata
+
+	// The table name has to fit in the struct.
+	if linux.XT_TABLE_MAXNAMELEN < len(name) {
+		return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+	}
+	copy(entries.Name[:], name)
+
+	// Deal with the built in chains first (INPUT, OUTPUT, etc.). Each of
+	// these chains ends with an unconditional policy entry.
+	for hook := iptables.Prerouting; hook < iptables.NumHooks; hook++ {
+		chain, ok := table.BuiltinChains[hook]
+		if !ok {
+			// This table doesn't support this hook.
+			continue
+		}
+
+		// Sanity check.
+		if len(chain.Rules) < 1 {
+			return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+		}
+
+		for ruleIdx, rule := range chain.Rules {
+			// If this is the first rule of a builtin chain, set
+			// the metadata hook entry point.
+			if ruleIdx == 0 {
+				meta.HookEntry[hook] = entries.Size
+			}
+
+			// Each rule corresponds to an entry.
+			entry := linux.KernelIPTEntry{
+				IPTEntry: linux.IPTEntry{
+					NextOffset:   linux.SizeOfIPTEntry,
+					TargetOffset: linux.SizeOfIPTEntry,
+				},
+			}
+
+			for _, matcher := range rule.Matchers {
+				// Serialize the matcher and add it to the
+				// entry.
+				serialized := marshalMatcher(matcher)
+				entry.Elems = append(entry.Elems, serialized...)
+				entry.NextOffset += uint16(len(serialized))
+				entry.TargetOffset += uint16(len(serialized))
+			}
+
+			// Serialize and append the target.
+			serialized := marshalTarget(rule.Target)
+			entry.Elems = append(entry.Elems, serialized...)
+			entry.NextOffset += uint16(len(serialized))
+
+			// The underflow rule is the last rule in the chain,
+			// and is an unconditional rule (i.e. it matches any
+			// packet). This is enforced when saving iptables.
+			if ruleIdx == len(chain.Rules)-1 {
+				meta.Underflow[hook] = entries.Size
+			}
+
+			entries.Size += uint32(entry.NextOffset)
+			entries.Entrytable = append(entries.Entrytable, entry)
+			meta.NumEntries++
+		}
+
+	}
+
+	// TODO(gvisor.dev/issue/170): Deal with the user chains here. Each of
+	// these starts with an error node holding the chain's name and ends
+	// with an unconditional return.
+
+	// Lastly, each table ends with an unconditional error target rule as
+	// its final entry.
+	errorEntry := linux.KernelIPTEntry{
+		IPTEntry: linux.IPTEntry{
+			NextOffset:   linux.SizeOfIPTEntry,
+			TargetOffset: linux.SizeOfIPTEntry,
+		},
+	}
+	var errorTarget linux.XTErrorTarget
+	errorTarget.Target.TargetSize = linux.SizeOfXTErrorTarget
+	copy(errorTarget.ErrorName[:], errorTargetName)
+	copy(errorTarget.Target.Name[:], errorTargetName)
+
+	// Serialize and add it to the list of entries.
+	errorTargetBuf := make([]byte, 0, linux.SizeOfXTErrorTarget)
+	serializedErrorTarget := binary.Marshal(errorTargetBuf, usermem.ByteOrder, errorTarget)
+	errorEntry.Elems = append(errorEntry.Elems, serializedErrorTarget...)
+	errorEntry.NextOffset += uint16(len(serializedErrorTarget))
+
+	entries.Size += uint32(errorEntry.NextOffset)
+	entries.Entrytable = append(entries.Entrytable, errorEntry)
+	meta.NumEntries++
+	meta.Size = entries.Size
+
+	return entries, meta, nil
+}
+
+func marshalMatcher(matcher iptables.Matcher) []byte {
+	switch matcher.(type) {
+	default:
+		// TODO(gvisor.dev/issue/170): We don't support any matchers yet, so
+		// any call to marshalMatcher will panic.
+		panic(fmt.Errorf("unknown matcher of type %T", matcher))
+	}
+}
+
+func marshalTarget(target iptables.Target) []byte {
+	switch target.(type) {
+	case iptables.UnconditionalAcceptTarget:
+		return marshalUnconditionalAcceptTarget()
+	default:
+		panic(fmt.Errorf("unknown target of type %T", target))
+	}
+}
+
+func marshalUnconditionalAcceptTarget() []byte {
+	// The target's name will be the empty string.
+	target := linux.XTStandardTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTStandardTarget,
+		},
+		Verdict: translateStandardVerdict(iptables.Accept),
+	}
+
+	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+// translateStandardVerdict translates verdicts the same way as the iptables
+// tool.
+func translateStandardVerdict(verdict iptables.Verdict) int32 {
+	switch verdict {
+	case iptables.Accept:
+		return -linux.NF_ACCEPT - 1
+	case iptables.Drop:
+		return -linux.NF_DROP - 1
+	case iptables.Queue:
+		return -linux.NF_QUEUE - 1
+	case iptables.Return:
+		return linux.NF_RETURN
+	case iptables.Jump:
+		// TODO(gvisor.dev/issue/170): Support Jump.
+		panic("Jump isn't supported yet")
+	default:
+		panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+	}
 }
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index f50d83f38..6b8c4a39d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -180,6 +180,10 @@ syscall_test(
 )
 
 syscall_test(
+    test = "//test/syscalls/linux:iptables_test",
+)
+
+syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:itimer_test",
 )
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index db0a1e661..e2b0716ef 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -913,6 +913,24 @@ cc_library(
 )
 
 cc_binary(
+    name = "iptables_test",
+    testonly = 1,
+    srcs = [
+        "iptables.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":iptables_types",
+        ":socket_test_util",
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_binary(
     name = "itimer_test",
     testonly = 1,
     srcs = ["itimer.cc"],
diff --git a/test/syscalls/linux/iptables.cc b/test/syscalls/linux/iptables.cc
new file mode 100644
index 000000000..b8e4ece64
--- /dev/null
+++ b/test/syscalls/linux/iptables.cc
@@ -0,0 +1,204 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/iptables.h"
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <stdio.h>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr char kNatTablename[] = "nat";
+constexpr char kErrorTarget[] = "ERROR";
+constexpr size_t kEmptyStandardEntrySize =
+    sizeof(struct ipt_entry) + sizeof(struct ipt_standard_target);
+constexpr size_t kEmptyErrorEntrySize =
+    sizeof(struct ipt_entry) + sizeof(struct ipt_error_target);
+
+TEST(IPTablesBasic, CreateSocket) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP),
+              SyscallSucceeds());
+
+  ASSERT_THAT(close(sock), SyscallSucceeds());
+}
+
+TEST(IPTablesBasic, FailSockoptNonRaw) {
+  // Even if the user has CAP_NET_RAW, they shouldn't be able to use the
+  // iptables sockopts with a non-raw socket.
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info);
+  EXPECT_THAT(getsockopt(sock, IPPROTO_IP, SO_GET_INFO, &info, &info_size),
+              SyscallFailsWithErrno(ENOPROTOOPT));
+
+  ASSERT_THAT(close(sock), SyscallSucceeds());
+}
+
+// Fixture for iptables tests.
+class IPTablesTest : public ::testing::Test {
+ protected:
+  // Creates a socket to be used in tests.
+  void SetUp() override;
+
+  // Closes the socket created by SetUp().
+  void TearDown() override;
+
+  // The socket via which to manipulate iptables.
+  int s_;
+};
+
+void IPTablesTest::SetUp() {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds());
+}
+
+void IPTablesTest::TearDown() {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  EXPECT_THAT(close(s_), SyscallSucceeds());
+}
+
+// This tests the initial state of a machine with empty iptables. We don't have
+// a guarantee that the iptables are empty when running in native, but we can
+// test that gVisor has the same initial state that a newly-booted Linux machine
+// would have.
+TEST_F(IPTablesTest, InitialState) {
+  SKIP_IF(!IsRunningOnGvisor());
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  //
+  // Get info via sockopt.
+  //
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info);
+  ASSERT_THAT(getsockopt(s_, IPPROTO_IP, SO_GET_INFO, &info, &info_size),
+              SyscallSucceeds());
+
+  // The nat table supports PREROUTING, INPUT, OUTPUT, and POSTROUTING.
+  unsigned int valid_hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT) |
+                             (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_LOCAL_IN);
+
+  EXPECT_EQ(info.valid_hooks, valid_hooks);
+
+  // Each chain consists of an empty entry with a standard target.
+  EXPECT_EQ(info.hook_entry[NF_IP_PRE_ROUTING], 0);
+  EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_IN], kEmptyStandardEntrySize);
+  EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2);
+  EXPECT_EQ(info.hook_entry[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3);
+
+  // The underflow points are the same as the entry points.
+  EXPECT_EQ(info.underflow[NF_IP_PRE_ROUTING], 0);
+  EXPECT_EQ(info.underflow[NF_IP_LOCAL_IN], kEmptyStandardEntrySize);
+  EXPECT_EQ(info.underflow[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2);
+  EXPECT_EQ(info.underflow[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3);
+
+  // One entry for each chain, plus an error entry at the end.
+  EXPECT_EQ(info.num_entries, 5);
+
+  EXPECT_EQ(info.size, 4 * kEmptyStandardEntrySize + kEmptyErrorEntrySize);
+  EXPECT_EQ(strcmp(info.name, kNatTablename), 0);
+
+  //
+  // Use info to get entries.
+  //
+  socklen_t entries_size = sizeof(struct ipt_get_entries) + info.size;
+  struct ipt_get_entries* entries =
+      static_cast<struct ipt_get_entries*>(malloc(entries_size));
+  snprintf(entries->name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  entries->size = info.size;
+  ASSERT_THAT(
+      getsockopt(s_, IPPROTO_IP, SO_GET_ENTRIES, entries, &entries_size),
+      SyscallSucceeds());
+
+  // Verify the name and size.
+  ASSERT_EQ(info.size, entries->size);
+  ASSERT_EQ(strcmp(entries->name, kNatTablename), 0);
+
+  // Verify that the entrytable is 4 entries with accept targets and no matches
+  // followed by a single error target.
+  size_t entry_offset = 0;
+  while (entry_offset < entries->size) {
+    struct ipt_entry* entry = reinterpret_cast<struct ipt_entry*>(
+        reinterpret_cast<char*>(entries->entrytable) + entry_offset);
+
+    // ip should be zeroes.
+    struct ipt_ip zeroed = {};
+    EXPECT_EQ(memcmp(static_cast<void*>(&zeroed),
+                     static_cast<void*>(&entry->ip), sizeof(zeroed)),
+              0);
+
+    // target_offset should equal sizeof(ipt_entry), as there are no matches.
+    EXPECT_EQ(entry->target_offset, sizeof(ipt_entry));
+
+    if (entry_offset < kEmptyStandardEntrySize * 4) {
+      // The first 4 entries are standard targets
+      struct ipt_standard_target* target =
+          reinterpret_cast<struct ipt_standard_target*>(entry->elems);
+      EXPECT_EQ(entry->next_offset, kEmptyStandardEntrySize);
+      EXPECT_EQ(target->target.u.user.target_size, sizeof(*target));
+      EXPECT_EQ(strcmp(target->target.u.user.name, ""), 0);
+      EXPECT_EQ(target->target.u.user.revision, 0);
+      // Accept is encoded as -NF_ACCEPT - 1, the same way iptables encodes it.
+      EXPECT_EQ(target->verdict, -NF_ACCEPT - 1);
+    } else {
+      // The last entry is an error target
+      struct ipt_error_target* target =
+          reinterpret_cast<struct ipt_error_target*>(entry->elems);
+      EXPECT_EQ(entry->next_offset, kEmptyErrorEntrySize);
+      EXPECT_EQ(target->target.u.user.target_size, sizeof(*target));
+      EXPECT_EQ(strcmp(target->target.u.user.name, kErrorTarget), 0);
+      EXPECT_EQ(target->target.u.user.revision, 0);
+      EXPECT_EQ(strcmp(target->errorname, kErrorTarget), 0);
+    }
+
+    entry_offset += entry->next_offset;
+  }
+
+  free(entries);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor