diff options
-rw-r--r-- | pkg/sentry/socket/epsocket/stack.go | 4 | ||||
-rw-r--r-- | pkg/sentry/socket/netfilter/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/socket/netfilter/netfilter.go | 254 | ||||
-rw-r--r-- | test/syscalls/BUILD | 4 | ||||
-rw-r--r-- | test/syscalls/linux/BUILD | 18 | ||||
-rw-r--r-- | test/syscalls/linux/iptables.cc | 204 |
6 files changed, 476 insertions, 9 deletions
diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 8f1572bf4..1b11f4b2d 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -198,8 +198,8 @@ func (s *Stack) IPTables() (iptables.IPTables, error) { // FillDefaultIPTables sets the stack's iptables to the default tables, which // allow and do not modify all traffic. -func (s *Stack) FillDefaultIPTables() error { - return netfilter.FillDefaultIPTables(s.Stack) +func (s *Stack) FillDefaultIPTables() { + netfilter.FillDefaultIPTables(s.Stack) } // Resume implements inet.Stack.Resume. diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index 3021f83e7..354a0d6ee 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -13,6 +13,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/sentry/kernel", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index efdb42903..9f87c32f1 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,7 +17,10 @@ package netfilter import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" @@ -26,21 +29,258 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" ) +// errorTargetName is used to mark targets as error targets. Error targets +// shouldn't be reached - an error has occurred if we fall through to one. +const errorTargetName = "ERROR" + +// metadata is opaque to netstack. It holds data that we need to translate +// between Linux's and netstack's iptables representations. +type metadata struct { + HookEntry [linux.NF_INET_NUMHOOKS]uint32 + Underflow [linux.NF_INET_NUMHOOKS]uint32 + NumEntries uint32 + Size uint32 +} + // GetInfo returns information about iptables. func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { - // TODO(b/129292233): Implement. - return linux.IPTGetinfo{}, syserr.ErrInvalidArgument + // Read in the struct and table name. + var info linux.IPTGetinfo + if _, err := t.CopyIn(outPtr, &info); err != nil { + return linux.IPTGetinfo{}, syserr.FromError(err) + } + + // Find the appropriate table. + table, err := findTable(ep, info.TableName()) + if err != nil { + return linux.IPTGetinfo{}, err + } + + // Get the hooks that apply to this table. + info.ValidHooks = table.ValidHooks() + + // Grab the metadata struct, which is used to store information (e.g. + // the number of entries) that applies to the user's encoding of + // iptables, but not netstack's. + metadata := table.Metadata().(metadata) + + // Set values from metadata. + info.HookEntry = metadata.HookEntry + info.Underflow = metadata.Underflow + info.NumEntries = metadata.NumEntries + info.Size = metadata.Size + + return info, nil } // GetEntries returns netstack's iptables rules encoded for the iptables tool. func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { - // TODO(b/129292233): Implement. - return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument + // Read in the struct and table name. + var userEntries linux.IPTGetEntries + if _, err := t.CopyIn(outPtr, &userEntries); err != nil { + return linux.KernelIPTGetEntries{}, syserr.FromError(err) + } + + // Find the appropriate table. + table, err := findTable(ep, userEntries.TableName()) + if err != nil { + return linux.KernelIPTGetEntries{}, err + } + + // Convert netstack's iptables rules to something that the iptables + // tool can understand. + entries, _, err := convertNetstackToBinary(userEntries.TableName(), table) + if err != nil { + return linux.KernelIPTGetEntries{}, err + } + if binary.Size(entries) > uintptr(outLen) { + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument + } + + return entries, nil +} + +func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) { + ipt, err := ep.IPTables() + if err != nil { + return iptables.Table{}, syserr.FromError(err) + } + table, ok := ipt.Tables[tableName] + if !ok { + return iptables.Table{}, syserr.ErrInvalidArgument + } + return table, nil } // FillDefaultIPTables sets stack's IPTables to the default tables and // populates them with metadata. -func FillDefaultIPTables(stack *stack.Stack) error { - stack.SetIPTables(iptables.DefaultTables()) - return nil +func FillDefaultIPTables(stack *stack.Stack) { + ipt := iptables.DefaultTables() + + // In order to fill in the metadata, we have to translate ipt from its + // netstack format to Linux's giant-binary-blob format. + for name, table := range ipt.Tables { + _, metadata, err := convertNetstackToBinary(name, table) + if err != nil { + panic(fmt.Errorf("Unable to set default IP tables: %v", err)) + } + table.SetMetadata(metadata) + ipt.Tables[name] = table + } + + stack.SetIPTables(ipt) +} + +// convertNetstackToBinary converts the iptables as stored in netstack to the +// format expected by the iptables tool. Linux stores each table as a binary +// blob that can only be traversed by parsing a bit, reading some offsets, +// jumping to those offsets, parsing again, etc. +func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) { + // Return values. + var entries linux.KernelIPTGetEntries + var meta metadata + + // The table name has to fit in the struct. + if linux.XT_TABLE_MAXNAMELEN < len(name) { + return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument + } + copy(entries.Name[:], name) + + // Deal with the built in chains first (INPUT, OUTPUT, etc.). Each of + // these chains ends with an unconditional policy entry. + for hook := iptables.Prerouting; hook < iptables.NumHooks; hook++ { + chain, ok := table.BuiltinChains[hook] + if !ok { + // This table doesn't support this hook. + continue + } + + // Sanity check. + if len(chain.Rules) < 1 { + return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument + } + + for ruleIdx, rule := range chain.Rules { + // If this is the first rule of a builtin chain, set + // the metadata hook entry point. + if ruleIdx == 0 { + meta.HookEntry[hook] = entries.Size + } + + // Each rule corresponds to an entry. + entry := linux.KernelIPTEntry{ + IPTEntry: linux.IPTEntry{ + NextOffset: linux.SizeOfIPTEntry, + TargetOffset: linux.SizeOfIPTEntry, + }, + } + + for _, matcher := range rule.Matchers { + // Serialize the matcher and add it to the + // entry. + serialized := marshalMatcher(matcher) + entry.Elems = append(entry.Elems, serialized...) + entry.NextOffset += uint16(len(serialized)) + entry.TargetOffset += uint16(len(serialized)) + } + + // Serialize and append the target. + serialized := marshalTarget(rule.Target) + entry.Elems = append(entry.Elems, serialized...) + entry.NextOffset += uint16(len(serialized)) + + // The underflow rule is the last rule in the chain, + // and is an unconditional rule (i.e. it matches any + // packet). This is enforced when saving iptables. + if ruleIdx == len(chain.Rules)-1 { + meta.Underflow[hook] = entries.Size + } + + entries.Size += uint32(entry.NextOffset) + entries.Entrytable = append(entries.Entrytable, entry) + meta.NumEntries++ + } + + } + + // TODO(gvisor.dev/issue/170): Deal with the user chains here. Each of + // these starts with an error node holding the chain's name and ends + // with an unconditional return. + + // Lastly, each table ends with an unconditional error target rule as + // its final entry. + errorEntry := linux.KernelIPTEntry{ + IPTEntry: linux.IPTEntry{ + NextOffset: linux.SizeOfIPTEntry, + TargetOffset: linux.SizeOfIPTEntry, + }, + } + var errorTarget linux.XTErrorTarget + errorTarget.Target.TargetSize = linux.SizeOfXTErrorTarget + copy(errorTarget.ErrorName[:], errorTargetName) + copy(errorTarget.Target.Name[:], errorTargetName) + + // Serialize and add it to the list of entries. + errorTargetBuf := make([]byte, 0, linux.SizeOfXTErrorTarget) + serializedErrorTarget := binary.Marshal(errorTargetBuf, usermem.ByteOrder, errorTarget) + errorEntry.Elems = append(errorEntry.Elems, serializedErrorTarget...) + errorEntry.NextOffset += uint16(len(serializedErrorTarget)) + + entries.Size += uint32(errorEntry.NextOffset) + entries.Entrytable = append(entries.Entrytable, errorEntry) + meta.NumEntries++ + meta.Size = entries.Size + + return entries, meta, nil +} + +func marshalMatcher(matcher iptables.Matcher) []byte { + switch matcher.(type) { + default: + // TODO(gvisor.dev/issue/170): We don't support any matchers yet, so + // any call to marshalMatcher will panic. + panic(fmt.Errorf("unknown matcher of type %T", matcher)) + } +} + +func marshalTarget(target iptables.Target) []byte { + switch target.(type) { + case iptables.UnconditionalAcceptTarget: + return marshalUnconditionalAcceptTarget() + default: + panic(fmt.Errorf("unknown target of type %T", target)) + } +} + +func marshalUnconditionalAcceptTarget() []byte { + // The target's name will be the empty string. + target := linux.XTStandardTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTStandardTarget, + }, + Verdict: translateStandardVerdict(iptables.Accept), + } + + ret := make([]byte, 0, linux.SizeOfXTStandardTarget) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + +// translateStandardVerdict translates verdicts the same way as the iptables +// tool. +func translateStandardVerdict(verdict iptables.Verdict) int32 { + switch verdict { + case iptables.Accept: + return -linux.NF_ACCEPT - 1 + case iptables.Drop: + return -linux.NF_DROP - 1 + case iptables.Queue: + return -linux.NF_QUEUE - 1 + case iptables.Return: + return linux.NF_RETURN + case iptables.Jump: + // TODO(gvisor.dev/issue/170): Support Jump. + panic("Jump isn't supported yet") + default: + panic(fmt.Sprintf("unknown standard verdict: %d", verdict)) + } } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index f50d83f38..6b8c4a39d 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -180,6 +180,10 @@ syscall_test( ) syscall_test( + test = "//test/syscalls/linux:iptables_test", +) + +syscall_test( size = "medium", test = "//test/syscalls/linux:itimer_test", ) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index db0a1e661..e2b0716ef 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -913,6 +913,24 @@ cc_library( ) cc_binary( + name = "iptables_test", + testonly = 1, + srcs = [ + "iptables.cc", + ], + linkstatic = 1, + deps = [ + ":iptables_types", + ":socket_test_util", + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( name = "itimer_test", testonly = 1, srcs = ["itimer.cc"], diff --git a/test/syscalls/linux/iptables.cc b/test/syscalls/linux/iptables.cc new file mode 100644 index 000000000..b8e4ece64 --- /dev/null +++ b/test/syscalls/linux/iptables.cc @@ -0,0 +1,204 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/iptables.h" + +#include <arpa/inet.h> +#include <linux/capability.h> +#include <linux/netfilter/x_tables.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <stdio.h> +#include <sys/poll.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <unistd.h> + +#include <algorithm> + +#include "gtest/gtest.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +constexpr char kNatTablename[] = "nat"; +constexpr char kErrorTarget[] = "ERROR"; +constexpr size_t kEmptyStandardEntrySize = + sizeof(struct ipt_entry) + sizeof(struct ipt_standard_target); +constexpr size_t kEmptyErrorEntrySize = + sizeof(struct ipt_entry) + sizeof(struct ipt_error_target); + +TEST(IPTablesBasic, CreateSocket) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int sock; + ASSERT_THAT(sock = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), + SyscallSucceeds()); + + ASSERT_THAT(close(sock), SyscallSucceeds()); +} + +TEST(IPTablesBasic, FailSockoptNonRaw) { + // Even if the user has CAP_NET_RAW, they shouldn't be able to use the + // iptables sockopts with a non-raw socket. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int sock; + ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds()); + + struct ipt_getinfo info = {}; + snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename); + socklen_t info_size = sizeof(info); + EXPECT_THAT(getsockopt(sock, IPPROTO_IP, SO_GET_INFO, &info, &info_size), + SyscallFailsWithErrno(ENOPROTOOPT)); + + ASSERT_THAT(close(sock), SyscallSucceeds()); +} + +// Fixture for iptables tests. +class IPTablesTest : public ::testing::Test { + protected: + // Creates a socket to be used in tests. + void SetUp() override; + + // Closes the socket created by SetUp(). + void TearDown() override; + + // The socket via which to manipulate iptables. + int s_; +}; + +void IPTablesTest::SetUp() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds()); +} + +void IPTablesTest::TearDown() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + EXPECT_THAT(close(s_), SyscallSucceeds()); +} + +// This tests the initial state of a machine with empty iptables. We don't have +// a guarantee that the iptables are empty when running in native, but we can +// test that gVisor has the same initial state that a newly-booted Linux machine +// would have. +TEST_F(IPTablesTest, InitialState) { + SKIP_IF(!IsRunningOnGvisor()); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // + // Get info via sockopt. + // + struct ipt_getinfo info = {}; + snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename); + socklen_t info_size = sizeof(info); + ASSERT_THAT(getsockopt(s_, IPPROTO_IP, SO_GET_INFO, &info, &info_size), + SyscallSucceeds()); + + // The nat table supports PREROUTING, and OUTPUT. + unsigned int valid_hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT) | + (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_LOCAL_IN); + + EXPECT_EQ(info.valid_hooks, valid_hooks); + + // Each chain consists of an empty entry with a standard target.. + EXPECT_EQ(info.hook_entry[NF_IP_PRE_ROUTING], 0); + EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_IN], kEmptyStandardEntrySize); + EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2); + EXPECT_EQ(info.hook_entry[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3); + + // The underflow points are the same as the entry points. + EXPECT_EQ(info.underflow[NF_IP_PRE_ROUTING], 0); + EXPECT_EQ(info.underflow[NF_IP_LOCAL_IN], kEmptyStandardEntrySize); + EXPECT_EQ(info.underflow[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2); + EXPECT_EQ(info.underflow[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3); + + // One entry for each chain, plus an error entry at the end. + EXPECT_EQ(info.num_entries, 5); + + EXPECT_EQ(info.size, 4 * kEmptyStandardEntrySize + kEmptyErrorEntrySize); + EXPECT_EQ(strcmp(info.name, kNatTablename), 0); + + // + // Use info to get entries. + // + socklen_t entries_size = sizeof(struct ipt_get_entries) + info.size; + struct ipt_get_entries* entries = + static_cast<struct ipt_get_entries*>(malloc(entries_size)); + snprintf(entries->name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename); + entries->size = info.size; + ASSERT_THAT( + getsockopt(s_, IPPROTO_IP, SO_GET_ENTRIES, entries, &entries_size), + SyscallSucceeds()); + + // Verify the name and size. + ASSERT_EQ(info.size, entries->size); + ASSERT_EQ(strcmp(entries->name, kNatTablename), 0); + + // Verify that the entrytable is 4 entries with accept targets and no matches + // followed by a single error target. + size_t entry_offset = 0; + while (entry_offset < entries->size) { + struct ipt_entry* entry = reinterpret_cast<struct ipt_entry*>( + reinterpret_cast<char*>(entries->entrytable) + entry_offset); + + // ip should be zeroes. + struct ipt_ip zeroed = {}; + EXPECT_EQ(memcmp(static_cast<void*>(&zeroed), + static_cast<void*>(&entry->ip), sizeof(zeroed)), + 0); + + // target_offset should be zero. + EXPECT_EQ(entry->target_offset, sizeof(ipt_entry)); + + if (entry_offset < kEmptyStandardEntrySize * 4) { + // The first 4 entries are standard targets + struct ipt_standard_target* target = + reinterpret_cast<struct ipt_standard_target*>(entry->elems); + EXPECT_EQ(entry->next_offset, kEmptyStandardEntrySize); + EXPECT_EQ(target->target.u.user.target_size, sizeof(*target)); + EXPECT_EQ(strcmp(target->target.u.user.name, ""), 0); + EXPECT_EQ(target->target.u.user.revision, 0); + // This is what's returned for an accept verdict. I don't know why. + EXPECT_EQ(target->verdict, -NF_ACCEPT - 1); + } else { + // The last entry is an error target + struct ipt_error_target* target = + reinterpret_cast<struct ipt_error_target*>(entry->elems); + EXPECT_EQ(entry->next_offset, kEmptyErrorEntrySize); + EXPECT_EQ(target->target.u.user.target_size, sizeof(*target)); + EXPECT_EQ(strcmp(target->target.u.user.name, kErrorTarget), 0); + EXPECT_EQ(target->target.u.user.revision, 0); + EXPECT_EQ(strcmp(target->errorname, kErrorTarget), 0); + } + + entry_offset += entry->next_offset; + } + + free(entries); +} + +} // namespace + +} // namespace testing +} // namespace gvisor |