From 9143fcd7fd38243dd40f927dafaeb75f6ef8ef49 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:47:17 -0800
Subject: Add UDP matchers.

---
 pkg/sentry/socket/netfilter/netfilter.go | 164 ++++++++++++++++++++++++++-----
 1 file changed, 141 insertions(+), 23 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index e1f2bacce..45296b339 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -131,6 +131,7 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
+// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
@@ -318,10 +319,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 		var entry linux.IPTEntry
 		buf := optVal[:linux.SizeOfIPTEntry]
-		optVal = optVal[linux.SizeOfIPTEntry:]
 		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
-		if entry.TargetOffset != linux.SizeOfIPTEntry {
-			// TODO(gvisor.dev/issue/170): Support matchers.
+		initialOptValLen := len(optVal)
+		optVal = optVal[linux.SizeOfIPTEntry:]
+
+		if entry.TargetOffset < linux.SizeOfIPTEntry {
+			log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset)
 			return syserr.ErrInvalidArgument
 		}
 
@@ -332,19 +335,41 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			return err
 		}
 
+		// TODO: Matchers (and maybe targets) can specify that they only work for certain protocols, hooks, tables.
+		// Get matchers.
+		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
+		if len(optVal) < int(matchersSize) {
+			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+		}
+		matchers, err := parseMatchers(filter, optVal[:matchersSize])
+		if err != nil {
+			log.Warningf("netfilter: failed to parse matchers: %v", err)
+			return err
+		}
+		optVal = optVal[matchersSize:]
+
 		// Get the target of the rule.
-		target, consumed, err := parseTarget(optVal)
+		targetSize := entry.NextOffset - entry.TargetOffset
+		if len(optVal) < int(targetSize) {
+			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+		}
+		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
 			return err
 		}
-		optVal = optVal[consumed:]
+		optVal = optVal[targetSize:]
 
 		table.Rules = append(table.Rules, iptables.Rule{
-			Filter: filter,
-			Target: target,
+			Filter:   filter,
+			Target:   target,
+			Matchers: matchers,
 		})
 		offsets = append(offsets, offset)
-		offset += linux.SizeOfIPTEntry + consumed
+		offset += uint32(entry.NextOffset)
+
+		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+			log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+		}
 	}
 
 	// Go through the list of supported hooks for this table and, for each
@@ -401,12 +426,105 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	return nil
 }
 
-// parseTarget parses a target from the start of optVal and returns the target
-// along with the number of bytes it occupies in optVal.
-func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
+// parseMatchers parses 0 or more matchers from optVal. optVal should contain
+// only the matchers.
+func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+	var matchers []iptables.Matcher
+	for len(optVal) > 0 {
+		log.Infof("parseMatchers: optVal has len %d", len(optVal))
+		// Get the XTEntryMatch.
+		if len(optVal) < linux.SizeOfXTEntryMatch {
+			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		var match linux.XTEntryMatch
+		buf := optVal[:linux.SizeOfXTEntryMatch]
+		binary.Unmarshal(buf, usermem.ByteOrder, &match)
+		log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+
+		// Check some invariants.
+		if match.MatchSize < linux.SizeOfXTEntryMatch {
+			log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
+			return nil, syserr.ErrInvalidArgument
+		}
+		if len(optVal) < int(match.MatchSize) {
+			log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize]
+		var matcher iptables.Matcher
+		var err error
+		switch match.Name.String() {
+		case "tcp":
+			if len(buf) < linux.SizeOfXTTCP {
+				log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal))
+				return nil, syserr.ErrInvalidArgument
+			}
+			var matchData linux.XTTCP
+			// For alignment reasons, the match's total size may exceed what's
+			// strictly necessary to hold matchData.
+			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+			log.Infof("parseMatchers: parsed XTTCP: %+v", matchData)
+			matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{
+				SourcePortStart:      matchData.SourcePortStart,
+				SourcePortEnd:        matchData.SourcePortEnd,
+				DestinationPortStart: matchData.DestinationPortStart,
+				DestinationPortEnd:   matchData.DestinationPortEnd,
+				Option:               matchData.Option,
+				FlagMask:             matchData.FlagMask,
+				FlagCompare:          matchData.FlagCompare,
+				InverseFlags:         matchData.InverseFlags,
+			})
+			if err != nil {
+				log.Warningf("netfilter: failed to create TCP matcher: %v", err)
+				return nil, syserr.ErrInvalidArgument
+			}
+
+		case "udp":
+			if len(buf) < linux.SizeOfXTUDP {
+				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
+				return nil, syserr.ErrInvalidArgument
+			}
+			var matchData linux.XTUDP
+			// For alignment reasons, the match's total size may exceed what's
+			// strictly necessary to hold matchData.
+			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
+			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{
+				SourcePortStart:      matchData.SourcePortStart,
+				SourcePortEnd:        matchData.SourcePortEnd,
+				DestinationPortStart: matchData.DestinationPortStart,
+				DestinationPortEnd:   matchData.DestinationPortEnd,
+				InverseFlags:         matchData.InverseFlags,
+			})
+			if err != nil {
+				log.Warningf("netfilter: failed to create UDP matcher: %v", err)
+				return nil, syserr.ErrInvalidArgument
+			}
+
+		default:
+			log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String())
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		matchers = append(matchers, matcher)
+
+		// TODO: Support revision.
+		// TODO: Support proto -- matchers usually specify which proto(s) they work with.
+		optVal = optVal[match.MatchSize:]
+	}
+
+	// TODO: Check that optVal is exhausted.
+	return matchers, nil
+}
+
+// parseTarget parses a target from optVal. optVal should contain only the
+// target.
+func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
-		return nil, 0, syserr.ErrInvalidArgument
+		return nil, syserr.ErrInvalidArgument
 	}
 	var target linux.XTEntryTarget
 	buf := optVal[:linux.SizeOfXTEntryTarget]
@@ -414,9 +532,9 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 	switch target.Name.String() {
 	case "":
 		// Standard target.
-		if len(optVal) < linux.SizeOfXTStandardTarget {
-			log.Warningf("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal))
-			return nil, 0, syserr.ErrInvalidArgument
+		if len(optVal) != linux.SizeOfXTStandardTarget {
+			log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
 		}
 		var standardTarget linux.XTStandardTarget
 		buf = optVal[:linux.SizeOfXTStandardTarget]
@@ -424,22 +542,22 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 
 		verdict, err := translateToStandardVerdict(standardTarget.Verdict)
 		if err != nil {
-			return nil, 0, err
+			return nil, err
 		}
 		switch verdict {
 		case iptables.Accept:
-			return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil
+			return iptables.UnconditionalAcceptTarget{}, nil
 		case iptables.Drop:
-			return iptables.UnconditionalDropTarget{}, linux.SizeOfXTStandardTarget, nil
+			return iptables.UnconditionalDropTarget{}, nil
 		default:
 			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
 		}
 
 	case errorTargetName:
 		// Error target.
-		if len(optVal) < linux.SizeOfXTErrorTarget {
+		if len(optVal) != linux.SizeOfXTErrorTarget {
 			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
-			return nil, 0, syserr.ErrInvalidArgument
+			return nil, syserr.ErrInvalidArgument
 		}
 		var errorTarget linux.XTErrorTarget
 		buf = optVal[:linux.SizeOfXTErrorTarget]
@@ -454,16 +572,16 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
 		//   rules have an error with the name of the chain.
 		switch errorTarget.Name.String() {
 		case errorTargetName:
-			return iptables.ErrorTarget{}, linux.SizeOfXTErrorTarget, nil
+			return iptables.ErrorTarget{}, nil
 		default:
 			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
-			return nil, 0, syserr.ErrInvalidArgument
+			return nil, syserr.ErrInvalidArgument
 		}
 	}
 
 	// Unknown target.
 	log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-	return nil, 0, syserr.ErrInvalidArgument
+	return nil, syserr.ErrInvalidArgument
 }
 
 func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) {
-- 
cgit v1.2.3


From 2661101ad470548cb15dce0afc694296668d780a Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 14:51:28 -0800
Subject: Removed TCP work (saved in ipt-tcp-match).

---
 pkg/abi/linux/netfilter.go               |  52 -------------
 pkg/sentry/socket/netfilter/netfilter.go |  26 -------
 pkg/tcpip/iptables/tcp_matcher.go        | 122 -------------------------------
 3 files changed, 200 deletions(-)
 delete mode 100644 pkg/tcpip/iptables/tcp_matcher.go

(limited to 'pkg/sentry/socket')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index fb4588272..f0e544f9c 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -341,58 +341,6 @@ func goString(cstring []byte) string {
 	return string(cstring)
 }
 
-// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
-// in include/uapi/linux/netfilter/xt_tcpudp.h.
-type XTTCP struct {
-	// SourcePortStart specifies the inclusive start of the range of source
-	// ports to which the matcher applies.
-	SourcePortStart uint16
-
-	// SourcePortEnd specifies the inclusive end of the range of source ports
-	// to which the matcher applies.
-	SourcePortEnd uint16
-
-	// DestinationPortStart specifies the start of the destination port
-	// range to which the matcher applies.
-	DestinationPortStart uint16
-
-	// DestinationPortEnd specifies the start of the destination port
-	// range to which the matcher applies.
-	DestinationPortEnd uint16
-
-	// Option specifies that a particular TCP option must be set.
-	Option uint8
-
-	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
-	// fields.
-	FlagMask uint8
-
-	// FlagCompare is binary and-ed with the TCP flag fields.
-	FlagCompare uint8
-
-	// InverseFlags flips the meaning of certain fields. See the
-	// TX_TCP_INV_* flags.
-	InverseFlags uint8
-}
-
-// SizeOfXTTCP is the size of an XTTCP.
-const SizeOfXTTCP = 12
-
-// Flags in XTTCP.InverseFlags. Corresponding constants are in
-// include/uapi/linux/netfilter/xt_tcpudp.h.
-const (
-	// Invert the meaning of SourcePortStart/End.
-	XT_TCP_INV_SRCPT = 0x01
-	// Invert the meaning of DestinationPortStart/End.
-	XT_TCP_INV_DSTPT = 0x02
-	// Invert the meaning of FlagCompare.
-	XT_TCP_INV_FLAGS = 0x04
-	// Invert the meaning of Option.
-	XT_TCP_INV_OPTION = 0x08
-	// Enable all flags.
-	XT_TCP_INV_MASK = 0x0F
-)
-
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 45296b339..f8ed1acbc 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -131,7 +131,6 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
-// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
@@ -456,31 +455,6 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var matcher iptables.Matcher
 		var err error
 		switch match.Name.String() {
-		case "tcp":
-			if len(buf) < linux.SizeOfXTTCP {
-				log.Warningf("netfilter: optVal has insufficient size for TCP match: %d", len(optVal))
-				return nil, syserr.ErrInvalidArgument
-			}
-			var matchData linux.XTTCP
-			// For alignment reasons, the match's total size may exceed what's
-			// strictly necessary to hold matchData.
-			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
-			log.Infof("parseMatchers: parsed XTTCP: %+v", matchData)
-			matcher, err = iptables.NewTCPMatcher(filter, iptables.TCPMatcherData{
-				SourcePortStart:      matchData.SourcePortStart,
-				SourcePortEnd:        matchData.SourcePortEnd,
-				DestinationPortStart: matchData.DestinationPortStart,
-				DestinationPortEnd:   matchData.DestinationPortEnd,
-				Option:               matchData.Option,
-				FlagMask:             matchData.FlagMask,
-				FlagCompare:          matchData.FlagCompare,
-				InverseFlags:         matchData.InverseFlags,
-			})
-			if err != nil {
-				log.Warningf("netfilter: failed to create TCP matcher: %v", err)
-				return nil, syserr.ErrInvalidArgument
-			}
-
 		case "udp":
 			if len(buf) < linux.SizeOfXTUDP {
 				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
diff --git a/pkg/tcpip/iptables/tcp_matcher.go b/pkg/tcpip/iptables/tcp_matcher.go
deleted file mode 100644
index 6acbd6eb9..000000000
--- a/pkg/tcpip/iptables/tcp_matcher.go
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-type TCPMatcher struct {
-	data TCPMatcherData
-
-	// tablename string
-	// unsigned int matchsize;
-	// unsigned int usersize;
-	// #ifdef CONFIG_COMPAT
-	// unsigned int compatsize;
-	// #endif
-	// unsigned int hooks;
-	// unsigned short proto;
-	// unsigned short family;
-}
-
-// TODO: Delete?
-// MatchCheckEntryParams
-
-type TCPMatcherData struct {
-	// Filter IPHeaderFilter
-
-	SourcePortStart      uint16
-	SourcePortEnd        uint16
-	DestinationPortStart uint16
-	DestinationPortEnd   uint16
-	Option               uint8
-	FlagMask             uint8
-	FlagCompare          uint8
-	InverseFlags         uint8
-}
-
-func NewTCPMatcher(filter IPHeaderFilter, data TCPMatcherData) (Matcher, error) {
-	// TODO: We currently only support source port and destination port.
-	log.Infof("Adding rule with TCPMatcherData: %+v", data)
-
-	if data.Option != 0 ||
-		data.FlagMask != 0 ||
-		data.FlagCompare != 0 ||
-		data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported TCP matcher flags set")
-	}
-
-	if filter.Protocol != header.TCPProtocolNumber {
-		log.Warningf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
-	}
-
-	return &TCPMatcher{data: data}, nil
-}
-
-// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
-func (tm *TCPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader)
-
-	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
-	// codesearch).
-	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
-		return false, false
-	}
-
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			log.Warningf("Dropping TCP packet: malicious packet with fragment with fragment offest of 1.")
-			return false, true
-		}
-		return false, false
-	}
-
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO
-	var tcpHeader header.TCP
-	if pkt.TransportHeader != nil {
-		tcpHeader = header.TCP(pkt.TransportHeader)
-	} else {
-		// The TCP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.TCPMinimumSize {
-			// There's no valid TCP header here, so we hotdrop the
-			// packet.
-			// TODO: Stats.
-			log.Warningf("Dropping TCP packet: size to small.")
-			return false, true
-		}
-		tcpHeader = header.TCP(pkt.Data.First())
-	}
-
-	// Check whether the source and destination ports are within the
-	// matching range.
-	sourcePort := tcpHeader.SourcePort()
-	destinationPort := tcpHeader.DestinationPort()
-	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
-		return false, false
-	}
-	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
-		return false, false
-	}
-
-	return true, false
-}
-- 
cgit v1.2.3


From 538053538dfb378aa8bc512d484ea305177e617b Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Jan 2020 16:51:17 -0800
Subject: Adding serialization.

---
 pkg/sentry/socket/netfilter/netfilter.go | 29 ++++++++++++++++++++++++++++-
 pkg/tcpip/iptables/udp_matcher.go        | 14 +++++++-------
 2 files changed, 35 insertions(+), 8 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index f8ed1acbc..3caabca9a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -196,7 +196,9 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 }
 
 func marshalMatcher(matcher iptables.Matcher) []byte {
-	switch matcher.(type) {
+	switch m := matcher.(type) {
+	case *iptables.UDPMatcher:
+		return marshalUDPMatcher(m)
 	default:
 		// TODO(gvisor.dev/issue/170): We don't support any matchers
 		// yet, so any call to marshalMatcher will panic.
@@ -204,6 +206,31 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 	}
 }
 
+func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
+	type udpMatch struct {
+		linux.XTEntryMatch
+		linux.XTUDP
+	}
+	linuxMatcher := udpMatch{
+		XTEntryMatch: linux.XTEntryMatch{
+			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
+			// Name:      "udp",
+		},
+		XTUDP: linux.XTUDP{
+			SourcePortStart:      matcher.Data.SourcePortStart,
+			SourcePortEnd:        matcher.Data.SourcePortEnd,
+			DestinationPortStart: matcher.Data.DestinationPortStart,
+			DestinationPortEnd:   matcher.Data.DestinationPortEnd,
+			InverseFlags:         matcher.Data.InverseFlags,
+		},
+	}
+	copy(linuxMatcher.Name[:], "udp")
+
+	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte
+	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
+	return buf[:]
+}
+
 func marshalTarget(target iptables.Target) []byte {
 	switch target.(type) {
 	case iptables.UnconditionalAcceptTarget:
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index ce4368a3d..fca457199 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -24,7 +24,7 @@ import (
 )
 
 type UDPMatcher struct {
-	data UDPMatcherData
+	Data UDPMatcherData
 
 	// tablename string
 	// unsigned int matchsize;
@@ -62,11 +62,11 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
 	}
 
-	return &UDPMatcher{data: data}, nil
+	return &UDPMatcher{Data: data}, nil
 }
 
 // TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
-func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
 	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
@@ -114,12 +114,12 @@ func (tm *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	destinationPort := udpHeader.DestinationPort()
 	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
 		udpHeader.SourcePort(), udpHeader.DestinationPort(),
-		tm.data.SourcePortStart, tm.data.SourcePortEnd,
-		tm.data.DestinationPortStart, tm.data.DestinationPortEnd)
-	if sourcePort < tm.data.SourcePortStart || tm.data.SourcePortEnd < sourcePort {
+		um.Data.SourcePortStart, um.Data.SourcePortEnd,
+		um.Data.DestinationPortStart, um.Data.DestinationPortEnd)
+	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
 		return false, false
 	}
-	if destinationPort < tm.data.DestinationPortStart || tm.data.DestinationPortEnd < destinationPort {
+	if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort {
 		return false, false
 	}
 
-- 
cgit v1.2.3


From b7853f688b4bcd3465c0c3087fcbd8d53bdf26ae Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 22 Jan 2020 14:46:15 -0800
Subject: Error marshalling the matcher.

The iptables binary is looking for libxt_.so when it should be looking
for libxt_udp.so, so it's having an issue reading the data in
xt_entry_match. I think it may be an alignment issue.

Trying to fix this is leading to me fighting with the metadata struct,
so I'm gonna go kill that.
---
 pkg/abi/linux/netfilter.go               |  5 +++++
 pkg/sentry/socket/netfilter/netfilter.go | 35 ++++++++++++++++++++------------
 pkg/tcpip/iptables/udp_matcher.go        |  2 +-
 3 files changed, 28 insertions(+), 14 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index f0e544f9c..effed7976 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -198,6 +198,11 @@ type XTEntryMatch struct {
 // SizeOfXTEntryMatch is the size of an XTEntryMatch.
 const SizeOfXTEntryMatch = 32
 
+type KernelXTEntryMatch struct {
+	XTEntryMatch
+	Data []byte
+}
+
 // XTEntryTarget holds a target for a rule. For example, it can specify that
 // packets matching the rule should DROP, ACCEPT, or use an extension target.
 // iptables-extension(8) has a list of possible targets.
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3caabca9a..b49fe5b3e 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -207,26 +207,34 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	type udpMatch struct {
-		linux.XTEntryMatch
-		linux.XTUDP
-	}
-	linuxMatcher := udpMatch{
+	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
 			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
 			// Name:      "udp",
 		},
-		XTUDP: linux.XTUDP{
-			SourcePortStart:      matcher.Data.SourcePortStart,
-			SourcePortEnd:        matcher.Data.SourcePortEnd,
-			DestinationPortStart: matcher.Data.DestinationPortStart,
-			DestinationPortEnd:   matcher.Data.DestinationPortEnd,
-			InverseFlags:         matcher.Data.InverseFlags,
-		},
+		Data: make([]byte, linux.SizeOfXTUDP+22),
 	}
+	// copy(linuxMatcher.Name[:], "udp")
 	copy(linuxMatcher.Name[:], "udp")
 
-	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP]byte
+	// TODO: Must be aligned.
+	xtudp := linux.XTUDP{
+		SourcePortStart:      matcher.Data.SourcePortStart,
+		SourcePortEnd:        matcher.Data.SourcePortEnd,
+		DestinationPortStart: matcher.Data.DestinationPortStart,
+		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
+		InverseFlags:         matcher.Data.InverseFlags,
+	}
+	binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp)
+
+	if binary.Size(linuxMatcher)%64 != 0 {
+		panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher)))
+	}
+
+	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte
+	if len(buf)%64 != 0 {
+		panic(fmt.Sprintf("len is actually: %d", len(buf)))
+	}
 	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
 	return buf[:]
 }
@@ -245,6 +253,7 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
+	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index fca457199..65ae7f9e0 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -59,7 +59,7 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 	}
 
 	if filter.Protocol != header.UDPProtocolNumber {
-		log.Warningf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
 	}
 
 	return &UDPMatcher{Data: data}, nil
-- 
cgit v1.2.3


From 2946fe81627afa223853769ed736e2a56e0144b7 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 24 Jan 2020 17:12:03 -0800
Subject: We can now actually write out the udp matcher.

---
 pkg/sentry/socket/netfilter/netfilter.go | 78 ++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 20 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3ca22932d..6c88a50a6 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -36,7 +36,7 @@ const errorTargetName = "ERROR"
 
 // metadata is opaque to netstack. It holds data that we need to translate
 // between Linux's and netstack's iptables representations.
-// TODO(gvisor.dev/issue/170): This might be removable.
+// TODO(gvisor.dev/issue/170): Use metadata to check correctness.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -44,6 +44,14 @@ type metadata struct {
 	Size       uint32
 }
 
+const enableDebugLog = true
+
+func nflog(format string, args ...interface{}) {
+	if enableDebugLog {
+		log.Infof("netfilter: "+format, args...)
+	}
+}
+
 // GetInfo returns information about iptables.
 func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
 	// Read in the struct and table name.
@@ -72,6 +80,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	info.NumEntries = metadata.NumEntries
 	info.Size = metadata.Size
 
+	nflog("GetInfo returning info: %+v", info)
+
 	return info, nil
 }
 
@@ -80,21 +90,26 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+		log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
 	}
 
 	// Find the appropriate table.
 	table, err := findTable(stack, userEntries.Name)
 	if err != nil {
+		log.Warningf("netfilter: couldn't find table %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, err
 	}
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
-	entries, _, err := convertNetstackToBinary(userEntries.Name.String(), table)
+	entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
 	if err != nil {
 		return linux.KernelIPTGetEntries{}, err
 	}
+	if meta != table.Metadata().(metadata) {
+		panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
+	}
 	if binary.Size(entries) > uintptr(outLen) {
 		log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -148,15 +163,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	copy(entries.Name[:], tablename)
 
 	for ruleIdx, rule := range table.Rules {
+		nflog("Current offset: %d", entries.Size)
+
 		// Is this a chain entry point?
 		for hook, hookRuleIdx := range table.BuiltinChains {
 			if hookRuleIdx == ruleIdx {
+				nflog("Found hook %d at offset %d", hook, entries.Size)
 				meta.HookEntry[hook] = entries.Size
 			}
 		}
 		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
+				nflog("Found underflow %d at offset %d", underflow, entries.Size)
 				meta.Underflow[underflow] = entries.Size
 			}
 		}
@@ -176,6 +195,10 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 			// Serialize the matcher and add it to the
 			// entry.
 			serialized := marshalMatcher(matcher)
+			nflog("matcher serialized as: %v", serialized)
+			if len(serialized)%8 != 0 {
+				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+			}
 			entry.Elems = append(entry.Elems, serialized...)
 			entry.NextOffset += uint16(len(serialized))
 			entry.TargetOffset += uint16(len(serialized))
@@ -183,18 +206,25 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 
 		// Serialize and append the target.
 		serialized := marshalTarget(rule.Target)
+		if len(serialized)%8 != 0 {
+			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+		}
 		entry.Elems = append(entry.Elems, serialized...)
 		entry.NextOffset += uint16(len(serialized))
 
+		nflog("Adding entry: %+v", entry)
+
 		entries.Size += uint32(entry.NextOffset)
 		entries.Entrytable = append(entries.Entrytable, entry)
 		meta.NumEntries++
 	}
 
+	nflog("Finished with an marshalled size of %d", meta.Size)
 	meta.Size = entries.Size
 	return entries, meta, nil
 }
 
+// TODO: SOMEHOW THIS IS NOT GETTING APPENDED!
 func marshalMatcher(matcher iptables.Matcher) []byte {
 	switch m := matcher.(type) {
 	case *iptables.UDPMatcher:
@@ -207,17 +237,17 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
+	nflog("Marshalling UDP matcher: %+v", matcher)
+
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP,
+			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6,
 			// Name:      "udp",
 		},
-		Data: make([]byte, linux.SizeOfXTUDP+22),
+		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
-	// copy(linuxMatcher.Name[:], "udp")
 	copy(linuxMatcher.Name[:], "udp")
 
-	// TODO: Must be aligned.
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.Data.SourcePortStart,
 		SourcePortEnd:        matcher.Data.SourcePortEnd,
@@ -225,17 +255,17 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
 		InverseFlags:         matcher.Data.InverseFlags,
 	}
-	binary.Marshal(linuxMatcher.Data[:linux.SizeOfXTUDP], usermem.ByteOrder, xtudp)
-
-	if binary.Size(linuxMatcher)%64 != 0 {
-		panic(fmt.Sprintf("size is actually: %d", binary.Size(linuxMatcher)))
-	}
-
-	var buf [linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 22]byte
-	if len(buf)%64 != 0 {
-		panic(fmt.Sprintf("len is actually: %d", len(buf)))
-	}
-	binary.Marshal(buf[:], usermem.ByteOrder, linuxMatcher)
+	nflog("marshalUDPMatcher: xtudp: %+v", xtudp)
+	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
+	nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher)
+
+	// We have to pad this struct size to a multiple of 8 bytes, so we make
+	// this a little longer than it needs to be.
+	buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6)
+	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
+	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
+	nflog("Marshalled into matcher of size %d", len(buf))
+	nflog("marshalUDPMatcher: buf is: %v", buf)
 	return buf[:]
 }
 
@@ -253,6 +283,8 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
+	nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+
 	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
@@ -321,7 +353,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
-	printReplace(optVal)
+	// printReplace(optVal)
 
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
@@ -343,10 +375,14 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
+	nflog("Setting entries in table %q", replace.Name.String())
+
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
 	var offsets []uint32
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		nflog("Processing entry at offset %d", offset)
+
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
 			log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
@@ -464,9 +500,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
 func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+	nflog("Parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
-		log.Infof("parseMatchers: optVal has len %d", len(optVal))
+		nflog("parseMatchers: optVal has len %d", len(optVal))
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
 			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
@@ -475,7 +512,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
 		binary.Unmarshal(buf, usermem.ByteOrder, &match)
-		log.Infof("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+		nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
@@ -532,6 +569,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
 func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
+	nflog("Parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, syserr.ErrInvalidArgument
-- 
cgit v1.2.3


From 29316e66adfc49c158425554761e34c12338f1d9 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 12:27:04 -0800
Subject: Cleanup for GH review.

---
 pkg/abi/linux/netfilter.go               |  14 +--
 pkg/sentry/socket/netfilter/netfilter.go | 144 ++++++++++++-------------------
 pkg/tcpip/iptables/types.go              |  15 ----
 pkg/tcpip/iptables/udp_matcher.go        |  62 ++++++-------
 test/iptables/filter_input.go            |   6 +-
 5 files changed, 88 insertions(+), 153 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index effed7976..8e40bcc62 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -198,6 +198,8 @@ type XTEntryMatch struct {
 // SizeOfXTEntryMatch is the size of an XTEntryMatch.
 const SizeOfXTEntryMatch = 32
 
+// KernelXTEntryMatch is identical to XTEntryMatch, but contains a
+// variable-length Data field.
 type KernelXTEntryMatch struct {
 	XTEntryMatch
 	Data []byte
@@ -349,19 +351,19 @@ func goString(cstring []byte) string {
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
-	// SourcePortStart specifies the inclusive start of the range of source
-	// ports to which the matcher applies.
+	// SourcePortStart is the inclusive start of the range of source ports
+	// to which the matcher applies.
 	SourcePortStart uint16
 
-	// SourcePortEnd specifies the inclusive end of the range of source ports
-	// to which the matcher applies.
+	// SourcePortEnd is the inclusive end of the range of source ports to
+	// which the matcher applies.
 	SourcePortEnd uint16
 
-	// DestinationPortStart specifies the start of the destination port
+	// DestinationPortStart is the inclusive start of the destination port
 	// range to which the matcher applies.
 	DestinationPortStart uint16
 
-	// DestinationPortEnd specifies the start of the destination port
+	// DestinationPortEnd is the inclusive end of the destination port
 	// range to which the matcher applies.
 	DestinationPortEnd uint16
 
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 6c88a50a6..b8848f08a 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -34,9 +34,16 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
-// metadata is opaque to netstack. It holds data that we need to translate
-// between Linux's and netstack's iptables representations.
-// TODO(gvisor.dev/issue/170): Use metadata to check correctness.
+const (
+	matcherNameUDP = "udp"
+)
+
+// metadata is used to verify that we are correctly serializing and
+// deserializing iptables into structs consumable by the iptables tool. We save
+// a metadata struct when the tables are written, and when they are read out we
+// verify that certain fields are the same.
+//
+// metadata is opaque to netstack.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -44,10 +51,12 @@ type metadata struct {
 	Size       uint32
 }
 
-const enableDebugLog = true
+const enableDebug = false
 
+// nflog logs messages related to the writing and reading of iptables, but only
+// when enableDebug is true.
 func nflog(format string, args ...interface{}) {
-	if enableDebugLog {
+	if enableDebug {
 		log.Infof("netfilter: "+format, args...)
 	}
 }
@@ -80,7 +89,7 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	info.NumEntries = metadata.NumEntries
 	info.Size = metadata.Size
 
-	nflog("GetInfo returning info: %+v", info)
+	nflog("returning info: %+v", info)
 
 	return info, nil
 }
@@ -163,19 +172,19 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	copy(entries.Name[:], tablename)
 
 	for ruleIdx, rule := range table.Rules {
-		nflog("Current offset: %d", entries.Size)
+		nflog("convert to binary: current offset: %d", entries.Size)
 
 		// Is this a chain entry point?
 		for hook, hookRuleIdx := range table.BuiltinChains {
 			if hookRuleIdx == ruleIdx {
-				nflog("Found hook %d at offset %d", hook, entries.Size)
+				nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
 				meta.HookEntry[hook] = entries.Size
 			}
 		}
 		// Is this a chain underflow point?
 		for underflow, underflowRuleIdx := range table.Underflows {
 			if underflowRuleIdx == ruleIdx {
-				nflog("Found underflow %d at offset %d", underflow, entries.Size)
+				nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
 				meta.Underflow[underflow] = entries.Size
 			}
 		}
@@ -195,7 +204,7 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 			// Serialize the matcher and add it to the
 			// entry.
 			serialized := marshalMatcher(matcher)
-			nflog("matcher serialized as: %v", serialized)
+			nflog("convert to binary: matcher serialized as: %v", serialized)
 			if len(serialized)%8 != 0 {
 				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
 			}
@@ -212,14 +221,14 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 		entry.Elems = append(entry.Elems, serialized...)
 		entry.NextOffset += uint16(len(serialized))
 
-		nflog("Adding entry: %+v", entry)
+		nflog("convert to binary: adding entry: %+v", entry)
 
 		entries.Size += uint32(entry.NextOffset)
 		entries.Entrytable = append(entries.Entrytable, entry)
 		meta.NumEntries++
 	}
 
-	nflog("Finished with an marshalled size of %d", meta.Size)
+	nflog("convert to binary: finished with an marshalled size of %d", meta.Size)
 	meta.Size = entries.Size
 	return entries, meta, nil
 }
@@ -237,16 +246,18 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
 }
 
 func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	nflog("Marshalling UDP matcher: %+v", matcher)
+	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
+
+	// We have to pad this struct size to a multiple of 8 bytes.
+	const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6
 
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6,
-			// Name:      "udp",
+			MatchSize: size,
 		},
 		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
-	copy(linuxMatcher.Name[:], "udp")
+	copy(linuxMatcher.Name[:], matcherNameUDP)
 
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.Data.SourcePortStart,
@@ -255,17 +266,12 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
 		InverseFlags:         matcher.Data.InverseFlags,
 	}
-	nflog("marshalUDPMatcher: xtudp: %+v", xtudp)
 	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
-	nflog("marshalUDPMatcher: linuxMatcher: %+v", linuxMatcher)
 
-	// We have to pad this struct size to a multiple of 8 bytes, so we make
-	// this a little longer than it needs to be.
-	buf := make([]byte, 0, linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP+6)
+	buf := make([]byte, 0, size)
 	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
 	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
-	nflog("Marshalled into matcher of size %d", len(buf))
-	nflog("marshalUDPMatcher: buf is: %v", buf)
+	nflog("convert to binary: marshalled UDP matcher into %v", buf)
 	return buf[:]
 }
 
@@ -283,9 +289,8 @@ func marshalTarget(target iptables.Target) []byte {
 }
 
 func marshalStandardTarget(verdict iptables.Verdict) []byte {
-	nflog("Marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
 
-	// TODO: Must be aligned.
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
@@ -353,8 +358,6 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
-	// printReplace(optVal)
-
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
 		log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
@@ -375,13 +378,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
-	nflog("Setting entries in table %q", replace.Name.String())
+	nflog("set entries: setting entries in table %q", replace.Name.String())
 
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
 	var offsets []uint32
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		nflog("Processing entry at offset %d", offset)
+		nflog("set entries: processing entry at offset %d", offset)
 
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
@@ -406,11 +409,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			return err
 		}
 
-		// TODO: Matchers (and maybe targets) can specify that they only work for certiain protocols, hooks, tables.
+		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
+		// that they only work for certiain protocols, hooks, tables.
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
 			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 		matchers, err := parseMatchers(filter, optVal[:matchersSize])
 		if err != nil {
@@ -423,6 +428,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		targetSize := entry.NextOffset - entry.TargetOffset
 		if len(optVal) < int(targetSize) {
 			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
@@ -500,10 +506,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
 func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
-	nflog("Parsing matchers of size %d", len(optVal))
+	nflog("set entries: parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
-		nflog("parseMatchers: optVal has len %d", len(optVal))
+		nflog("set entries: optVal has len %d", len(optVal))
+
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
 			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
@@ -512,7 +519,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
 		binary.Unmarshal(buf, usermem.ByteOrder, &match)
-		nflog("parseMatchers: parsed entry match %q: %+v", match.Name.String(), match)
+		nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match)
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
@@ -528,17 +535,17 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 		var matcher iptables.Matcher
 		var err error
 		switch match.Name.String() {
-		case "udp":
+		case matcherNameUDP:
 			if len(buf) < linux.SizeOfXTUDP {
 				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
 				return nil, syserr.ErrInvalidArgument
 			}
+			// For alignment reasons, the match's total size may
+			// exceed what's strictly necessary to hold matchData.
 			var matchData linux.XTUDP
-			// For alignment reasons, the match's total size may exceed what's
-			// strictly necessary to hold matchData.
 			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
 			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
-			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherData{
+			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{
 				SourcePortStart:      matchData.SourcePortStart,
 				SourcePortEnd:        matchData.SourcePortEnd,
 				DestinationPortStart: matchData.DestinationPortStart,
@@ -557,19 +564,22 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		matchers = append(matchers, matcher)
 
-		// TODO: Support revision.
-		// TODO: Support proto -- matchers usually specify which proto(s) they work with.
+		// TODO(gvisor.dev/issue/170): Check the revision field.
 		optVal = optVal[match.MatchSize:]
 	}
 
-	// TODO: Check that optVal is exhausted.
+	if len(optVal) != 0 {
+		log.Warningf("netfilter: optVal should be exhausted after parsing matchers")
+		return nil, syserr.ErrInvalidArgument
+	}
+
 	return matchers, nil
 }
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
 func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
-	nflog("Parsing target of size %d", len(optVal))
+	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
 		return nil, syserr.ErrInvalidArgument
@@ -598,7 +608,8 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case iptables.Drop:
 			return iptables.UnconditionalDropTarget{}, nil
 		default:
-			panic(fmt.Sprintf("Unknown verdict: %v", verdict))
+			log.Warningf("Unknown verdict: %v", verdict)
+			return nil, syserr.ErrInvalidArgument
 		}
 
 	case errorTargetName:
@@ -673,52 +684,3 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
-
-// printReplace prints information about the struct ipt_replace in optVal. It
-// is only for debugging.
-func printReplace(optVal []byte) {
-	// Basic replace info.
-	var replace linux.IPTReplace
-	replaceBuf := optVal[:linux.SizeOfIPTReplace]
-	optVal = optVal[linux.SizeOfIPTReplace:]
-	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
-	log.Infof("Replacing table %q: %+v", replace.Name.String(), replace)
-
-	// Read in the list of entries at the end of replace.
-	var totalOffset uint16
-	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		var entry linux.IPTEntry
-		entryBuf := optVal[:linux.SizeOfIPTEntry]
-		binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry)
-		log.Infof("Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry)
-
-		totalOffset += entry.NextOffset
-		if entry.TargetOffset == linux.SizeOfIPTEntry {
-			log.Infof("Entry has no matches.")
-		} else {
-			log.Infof("Entry has matches.")
-		}
-
-		var target linux.XTEntryTarget
-		targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget]
-		binary.Unmarshal(targetBuf, usermem.ByteOrder, &target)
-		log.Infof("Target named %q: %+v", target.Name.String(), target)
-
-		switch target.Name.String() {
-		case "":
-			var standardTarget linux.XTStandardTarget
-			stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget]
-			binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget)
-			log.Infof("Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict)
-		case errorTargetName:
-			var errorTarget linux.XTErrorTarget
-			etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget]
-			binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget)
-			log.Infof("Error target with name %q.", errorTarget.Name.String())
-		default:
-			log.Infof("Unknown target type.")
-		}
-
-		optVal = optVal[entry.NextOffset:]
-	}
-}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index d47447d40..ba5ed75b4 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -169,8 +169,6 @@ type IPHeaderFilter struct {
 	Protocol tcpip.TransportProtocolNumber
 }
 
-// TODO: Should these be able to marshal/unmarshal themselves?
-// TODO: Something has to map the name to the matcher.
 // A Matcher is the interface for matching packets.
 type Matcher interface {
 	// Match returns whether the packet matches and whether the packet
@@ -179,19 +177,6 @@ type Matcher interface {
 	//
 	// Precondition: packet.NetworkHeader is set.
 	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
-
-	// TODO: Make this typesafe by having each Matcher have their own, typed CheckEntry?
-	// CheckEntry(params MatchCheckEntryParams) bool
-}
-
-// TODO: Unused?
-type MatchCheckEntryParams struct {
-	Table  string // TODO: Tables should be an enum...
-	Filter IPHeaderFilter
-	Info   interface{} // TODO: Type unsafe.
-	// HookMask       uint8
-	// Family         uint8
-	// NFTCompat      bool
 }
 
 // A Target is the interface for taking an action for a packet.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
index 65ae7f9e0..f59ca2027 100644
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ b/pkg/tcpip/iptables/udp_matcher.go
@@ -16,33 +16,28 @@ package iptables
 
 import (
 	"fmt"
-	"runtime/debug"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+// TODO(gvisor.dev/issue/170): The following per-matcher params should be
+// supported:
+// - Table name
+// - Match size
+// - User size
+// - Hooks
+// - Proto
+// - Family
+
+// UDPMatcher matches UDP packets and their headers. It implements Matcher.
 type UDPMatcher struct {
-	Data UDPMatcherData
-
-	// tablename string
-	// unsigned int matchsize;
-	// unsigned int usersize;
-	// #ifdef CONFIG_COMPAT
-	// unsigned int compatsize;
-	// #endif
-	// unsigned int hooks;
-	// unsigned short proto;
-	// unsigned short family;
+	Data UDPMatcherParams
 }
 
-// TODO: Delete?
-// MatchCheckEntryParams
-
-type UDPMatcherData struct {
-	// Filter IPHeaderFilter
-
+// UDPMatcherParams are the parameters used to create a UDPMatcher.
+type UDPMatcherParams struct {
 	SourcePortStart      uint16
 	SourcePortEnd        uint16
 	DestinationPortStart uint16
@@ -50,12 +45,12 @@ type UDPMatcherData struct {
 	InverseFlags         uint8
 }
 
-func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error) {
-	// TODO: We currently only support source port and destination port.
-	log.Infof("Adding rule with UDPMatcherData: %+v", data)
+// NewUDPMatcher returns a new instance of UDPMatcher.
+func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) {
+	log.Infof("Adding rule with UDPMatcherParams: %+v", data)
 
 	if data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported UDP matcher flags set")
+		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
 	}
 
 	if filter.Protocol != header.UDPProtocolNumber {
@@ -65,21 +60,18 @@ func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherData) (Matcher, error)
 	return &UDPMatcher{Data: data}, nil
 }
 
-// TODO: Check xt_tcpudp.c. Need to check for same things (e.g. fragments).
+// Match implements Matcher.Match.
 func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	log.Infof("UDPMatcher called from: %s", string(debug.Stack()))
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
-	// TODO: Do we check proto here or elsewhere? I think elsewhere (check
-	// codesearch).
+	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+	// into the iptables.Check codepath as matchers are added.
 	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
-		log.Infof("UDPMatcher: wrong protocol number")
 		return false, false
 	}
 
 	// We dont't match fragments.
 	if frag := netHeader.FragmentOffset(); frag != 0 {
-		log.Infof("UDPMatcher: it's a fragment")
 		if frag == 1 {
 			return false, true
 		}
@@ -89,20 +81,18 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 
 	// Now we need the transport header. However, this may not have been set
 	// yet.
-	// TODO
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
 	var udpHeader header.UDP
 	if pkt.TransportHeader != nil {
-		log.Infof("UDPMatcher: transport header is not nil")
 		udpHeader = header.UDP(pkt.TransportHeader)
 	} else {
-		log.Infof("UDPMatcher: transport header is nil")
-		log.Infof("UDPMatcher: is network header nil: %t", pkt.NetworkHeader == nil)
 		// The UDP header hasn't been parsed yet. We have to do it here.
 		if len(pkt.Data.First()) < header.UDPMinimumSize {
 			// There's no valid UDP header here, so we hotdrop the
 			// packet.
-			// TODO: Stats.
-			log.Warningf("Dropping UDP packet: size to small.")
+			log.Warningf("Dropping UDP packet: size too small.")
 			return false, true
 		}
 		udpHeader = header.UDP(pkt.Data.First())
@@ -112,10 +102,6 @@ func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName str
 	// matching range.
 	sourcePort := udpHeader.SourcePort()
 	destinationPort := udpHeader.DestinationPort()
-	log.Infof("UDPMatcher: sport and dport are %d and %d. sports and dport start and end are (%d, %d) and (%d, %d)",
-		udpHeader.SourcePort(), udpHeader.DestinationPort(),
-		um.Data.SourcePortStart, um.Data.SourcePortEnd,
-		um.Data.DestinationPortStart, um.Data.DestinationPortEnd)
 	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
 		return false, false
 	}
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index bc963d40e..e9f0978eb 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -264,9 +264,9 @@ func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
 	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
 		return err
 	}
-	// if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
-	// 	return err
-	// }
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
+		return err
+	}
 	return filterTable("-L")
 }
 
-- 
cgit v1.2.3


From d6a2e01d3e57e0837c7e5cfda3b56c4dcfbb4627 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Mon, 27 Jan 2020 16:40:46 -0800
Subject: Address GH comments.

---
 pkg/sentry/socket/netfilter/netfilter.go | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index b8848f08a..a06562743 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -43,7 +43,7 @@ const (
 // a metadata struct when the tables are written, and when they are read out we
 // verify that certain fields are the same.
 //
-// metadata is opaque to netstack.
+// metadata is used by this serialization/deserialization code, not netstack.
 type metadata struct {
 	HookEntry  [linux.NF_INET_NUMHOOKS]uint32
 	Underflow  [linux.NF_INET_NUMHOOKS]uint32
@@ -51,14 +51,10 @@ type metadata struct {
 	Size       uint32
 }
 
-const enableDebug = false
-
 // nflog logs messages related to the writing and reading of iptables, but only
 // when enableDebug is true.
 func nflog(format string, args ...interface{}) {
-	if enableDebug {
-		log.Infof("netfilter: "+format, args...)
-	}
+	log.Infof("netfilter: "+format, args...)
 }
 
 // GetInfo returns information about iptables.
@@ -233,14 +229,12 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-// TODO: SOMEHOW THIS IS NOT GETTING APPENDED!
 func marshalMatcher(matcher iptables.Matcher) []byte {
 	switch m := matcher.(type) {
 	case *iptables.UDPMatcher:
 		return marshalUDPMatcher(m)
 	default:
-		// TODO(gvisor.dev/issue/170): We don't support any matchers
-		// yet, so any call to marshalMatcher will panic.
+		// TODO(gvisor.dev/issue/170): Support other matchers.
 		panic(fmt.Errorf("unknown matcher of type %T", matcher))
 	}
 }
@@ -249,11 +243,11 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
 
 	// We have to pad this struct size to a multiple of 8 bytes.
-	const size = linux.SizeOfXTEntryMatch + linux.SizeOfXTUDP + 6
+	size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8)
 
 	linuxMatcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: size,
+			MatchSize: uint16(size),
 		},
 		Data: make([]byte, 0, linux.SizeOfXTUDP),
 	}
@@ -270,7 +264,7 @@ func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
 
 	buf := make([]byte, 0, size)
 	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
-	buf = append(buf, []byte{0, 0, 0, 0, 0, 0}...)
+	buf = append(buf, make([]byte, size-len(buf))...)
 	nflog("convert to binary: marshalled UDP matcher into %v", buf)
 	return buf[:]
 }
@@ -410,7 +404,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 
 		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
-		// that they only work for certiain protocols, hooks, tables.
+		// that they only work for certain protocols, hooks, tables.
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
@@ -684,3 +678,8 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
+
+// alignUp rounds a length up to an alignment. align must be a power of 2.
+func alignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
-- 
cgit v1.2.3


From 51b783505b1ec164b02b48a0fd234509fba01a73 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 29 Jan 2020 15:41:51 -0800
Subject: Add support for TCP_DEFER_ACCEPT.

PiperOrigin-RevId: 292233574
---
 pkg/sentry/socket/netstack/netstack.go      |  22 ++++
 pkg/tcpip/tcpip.go                          |   6 ++
 pkg/tcpip/transport/tcp/BUILD               |   1 +
 pkg/tcpip/transport/tcp/accept.go           |  25 ++---
 pkg/tcpip/transport/tcp/connect.go          |  53 +++++++++-
 pkg/tcpip/transport/tcp/endpoint.go         |  26 ++++-
 pkg/tcpip/transport/tcp/forwarder.go        |   4 +-
 pkg/tcpip/transport/tcp/tcp_test.go         | 126 ++++++++++++++++++++++
 test/syscalls/linux/socket_inet_loopback.cc | 158 ++++++++++++++++++++++++++++
 test/syscalls/linux/tcp_socket.cc           |  53 ++++++++++
 10 files changed, 451 insertions(+), 23 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8619cc506..049d04bf2 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1260,6 +1260,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_DEFER_ACCEPT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPDeferAcceptOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Second), nil
+
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1713,6 +1725,16 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		v := usermem.ByteOrder.Uint32(optVal)
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_DEFER_ACCEPT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			v = 0
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 59c9b3fb0..0fa141d58 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -626,6 +626,12 @@ type TCPLingerTimeoutOption time.Duration
 // before being marked closed.
 type TCPTimeWaitTimeoutOption time.Duration
 
+// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow an
+// accept to return a completed connection only when there is data to be
+// read. This usually means the listening socket will drop the final ACK
+// for a handshake until a segment with data arrives or the timeout expires.
+type TCPDeferAcceptOption time.Duration
+
 // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
 // TTL value for multicast messages. The default is 1.
 type MulticastTTLOption uint8
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 4acd9fb9a..7b4a87a2d 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -57,6 +57,7 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d469758eb..6101f2945 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -222,13 +222,13 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
 // the connection parameters given by the arguments.
-func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
 	if netProto == 0 {
 		netProto = s.route.NetProto
 	}
-	n := newEndpoint(l.stack, netProto, nil)
+	n := newEndpoint(l.stack, netProto, queue)
 	n.v6only = l.v6only
 	n.ID = s.id
 	n.boundNICID = s.route.NICID()
@@ -273,16 +273,17 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 // createEndpoint creates a new endpoint in connected state and then performs
 // the TCP 3-way handshake.
-func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
 	isn := generateSecureISN(s.id, l.stack.Seed())
-	ep, err := l.createConnectingEndpoint(s, isn, irs, opts)
+	ep, err := l.createConnectingEndpoint(s, isn, irs, opts, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
+	deferAccept := time.Duration(0)
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
@@ -290,13 +291,12 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
 
 	// Perform the 3-way handshake.
-	h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
-
-	h.resetToSynRcvd(isn, irs, opts)
+	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.Close()
 		if l.listenEP != nil {
@@ -377,16 +377,14 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	defer e.decSynRcvdCount()
 	defer s.decRef()
 
-	n, err := ctx.createEndpointAndPerformHandshake(s, opts)
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
 		return
 	}
 	ctx.removePendingEndpoint(n)
-	// Start the protocol goroutine.
-	wq := &waiter.Queue{}
-	n.startAcceptedLoop(wq)
+	n.startAcceptedLoop()
 	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 
 	e.deliverAccepted(n)
@@ -546,7 +544,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
 		}
 
-		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions)
+		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions, &waiter.Queue{})
 		if err != nil {
 			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 			e.stats.FailedConnectionAttempts.Increment()
@@ -576,8 +574,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// space available in the backlog.
 
 		// Start the protocol goroutine.
-		wq := &waiter.Queue{}
-		n.startAcceptedLoop(wq)
+		n.startAcceptedLoop()
 		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		go e.deliverAccepted(n)
 	}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4e3c5419c..9ff7ac261 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -86,6 +86,19 @@ type handshake struct {
 
 	// rcvWndScale is the receive window scale, as defined in RFC 1323.
 	rcvWndScale int
+
+	// startTime is the time at which the first SYN/SYN-ACK was sent.
+	startTime time.Time
+
+	// deferAccept if non-zero will drop the final ACK for a passive
+	// handshake till an ACK segment with data is received or the timeout is
+	// hit.
+	deferAccept time.Duration
+
+	// acked is true if the final ACK for a 3-way handshake has
+	// been received. This is required to stop retransmitting the
+	// original SYN-ACK when deferAccept is enabled.
+	acked bool
 }
 
 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
@@ -112,6 +125,12 @@ func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
 	return h
 }
 
+func newPassiveHandshake(ep *endpoint, rcvWnd seqnum.Size, isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) handshake {
+	h := newHandshake(ep, rcvWnd)
+	h.resetToSynRcvd(isn, irs, opts, deferAccept)
+	return h
+}
+
 // FindWndScale determines the window scale to use for the given maximum window
 // size.
 func FindWndScale(wnd seqnum.Size) int {
@@ -181,7 +200,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 {
 
 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
 // state.
-func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) {
 	h.active = false
 	h.state = handshakeSynRcvd
 	h.flags = header.TCPFlagSyn | header.TCPFlagAck
@@ -189,6 +208,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.ackNum = irs + 1
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
+	h.deferAccept = deferAccept
 	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
@@ -352,6 +372,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 	// We have previously received (and acknowledged) the peer's SYN. If the
 	// peer acknowledges our SYN, the handshake is completed.
 	if s.flagIsSet(header.TCPFlagAck) {
+		// If deferAccept is not zero and this is a bare ACK and the
+		// timeout is not hit then drop the ACK.
+		if h.deferAccept != 0 && s.data.Size() == 0 && time.Since(h.startTime) < h.deferAccept {
+			h.acked = true
+			h.ep.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
 		// If the timestamp option is negotiated and the segment does
 		// not carry a timestamp option then the segment must be dropped
 		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
@@ -365,10 +393,16 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
 		}
 		h.state = handshakeCompleted
+
 		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+		// If the segment has data then requeue it for the receiver
+		// to process it again once main loop is started.
+		if s.data.Size() > 0 {
+			s.incRef()
+			h.ep.enqueueSegment(s)
+		}
 		h.ep.mu.Unlock()
-
 		return nil
 	}
 
@@ -471,6 +505,7 @@ func (h *handshake) execute() *tcpip.Error {
 		}
 	}
 
+	h.startTime = time.Now()
 	// Initialize the resend timer.
 	resendWaker := sleep.Waker{}
 	timeOut := time.Duration(time.Second)
@@ -524,11 +559,21 @@ func (h *handshake) execute() *tcpip.Error {
 		switch index, _ := s.Fetch(true); index {
 		case wakerForResend:
 			timeOut *= 2
-			if timeOut > 60*time.Second {
+			if timeOut > MaxRTO {
 				return tcpip.ErrTimeout
 			}
 			rt.Reset(timeOut)
-			h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			// Resend the SYN/SYN-ACK only if the following conditions hold.
+			//  - It's an active handshake (deferAccept does not apply)
+			//  - It's a passive handshake and we have not yet got the final-ACK.
+			//  - It's a passive handshake and we got an ACK but deferAccept is
+			//    enabled and we are now past the deferAccept duration.
+			// The last is required to provide a way for the peer to complete
+			// the connection with another ACK or data (as ACKs are never
+			// retransmitted on their own).
+			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
+				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			}
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 13718ff55..8d52414b7 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -498,6 +498,13 @@ type endpoint struct {
 	// without any data being acked.
 	userTimeout time.Duration
 
+	// deferAccept if non-zero specifies a user specified time during
+	// which the final ACK of a handshake will be dropped provided the
+	// ACK is a bare ACK and carries no data. If the timeout is crossed then
+	// the bare ACK is accepted and the connection is delivered to the
+	// listener.
+	deferAccept time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -1574,6 +1581,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		if time.Duration(v) > MaxRTO {
+			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		}
+		e.deferAccept = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return nil
 	}
@@ -1798,6 +1814,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case *tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -2149,9 +2171,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
-func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+func (e *endpoint) startAcceptedLoop() {
 	e.mu.Lock()
-	e.waiterQueue = waiterQueue
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2177,7 +2198,6 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
-
 	return n, n.waiterQueue, nil
 }
 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 7eb613be5..c9ee5bf06 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -157,13 +157,13 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 		TSVal:         r.synOptions.TSVal,
 		TSEcr:         r.synOptions.TSEcr,
 		SACKPermitted: r.synOptions.SACKPermitted,
-	})
+	}, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// Start the protocol goroutine.
-	ep.startAcceptedLoop(queue)
+	ep.startAcceptedLoop()
 
 	return ep, nil
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index df2fb1071..a12336d47 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6787,3 +6787,129 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 		),
 	)
 }
+
+func TestTCPDeferAccept(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give a bit of time for the socket to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+func TestTCPDeferAcceptTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Sleep for a little over the tcpDeferAccept timeout.
+	time.Sleep(tcpDeferAccept + 100*time.Millisecond)
+
+	// On timeout expiry we should get a SYN-ACK retransmission.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1)))
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give some time for the endpoint to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2f9821555..3bf7081b9 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -828,6 +828,164 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
   EXPECT_EQ(get, kUserTimeout);
 }
 
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAccept_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R once issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now write some data to the socket.
+  int data = 0;
+  ASSERT_THAT(RetryEINTR(write)(conn_fd.get(), &data, sizeof(data)),
+              SyscallSucceedsWithValue(sizeof(data)));
+
+  // This should now cause the connection to complete and be delivered to the
+  // accept socket.
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Verify that the accepted socket returns the data written.
+  int get = -1;
+  ASSERT_THAT(RetryEINTR(recv)(accepted.get(), &get, sizeof(get), 0),
+              SyscallSucceedsWithValue(sizeof(get)));
+
+  EXPECT_EQ(get, data);
+}
+
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R once issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Verify that there is no acceptable connection before TCP_DEFER_ACCEPT
+  // timeout is hit.
+  absl::SleepFor(absl::Seconds(kTCPDeferAccept - 1));
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now sleep for a little over the TCP_DEFER_ACCEPT duration. When the timeout
+  // is hit a SYN-ACK should be retransmitted by the listener as a last ditch
+  // attempt to complete the connection with or without data.
+  absl::SleepFor(absl::Seconds(2));
+
+  // Verify that we have a connection that can be accepted even though no
+  // data was written.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 33a5ac66c..525ccbd88 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1286,6 +1286,59 @@ TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
   EXPECT_EQ(get, kTCPUserTimeout);
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // A negative TCP_DEFER_ACCEPT is the same as setting it to zero.
+  constexpr int kNeg = -1;
+  EXPECT_THAT(
+      setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &kNeg, sizeof(kNeg)),
+      SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // A freshly created socket is expected to report TCP_DEFER_ACCEPT as zero.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // kTCPDeferAccept is in seconds.
+  // NOTE: linux translates seconds to # of retries and back from
+  //   #of retries to seconds. Which means only certain values
+  //   translate back exactly. That's why we use 3 here, a value of
+  //   5 will result in us getting back 7 instead of 5 in the
+  //   getsockopt.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPDeferAccept);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From eba7bdc24d31388ca81eeab251ed2db108f785dc Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 31 Jan 2020 13:46:13 -0800
Subject: iptables: enable TCP matching with "-m tcp".

A couple other things that changed:

- There's a proper extension registration system for matchers. Anyone
  adding another matcher can use tcp_matcher.go or udp_matcher.go as a
  template.
- All logging and use of syserr.Error in the netfilter package happens at the
  highest possible level (public functions). Lower-level functions just
  return normal, descriptive golang errors.
---
 pkg/abi/linux/netfilter.go                 |  52 ++++++++
 pkg/sentry/socket/netfilter/BUILD          |   4 +
 pkg/sentry/socket/netfilter/extensions.go  |  98 +++++++++++++++
 pkg/sentry/socket/netfilter/netfilter.go   | 187 ++++++++---------------------
 pkg/sentry/socket/netfilter/tcp_matcher.go | 143 ++++++++++++++++++++++
 pkg/sentry/socket/netfilter/udp_matcher.go | 142 ++++++++++++++++++++++
 pkg/tcpip/iptables/BUILD                   |   1 -
 pkg/tcpip/iptables/types.go                |   3 +
 pkg/tcpip/iptables/udp_matcher.go          | 113 -----------------
 9 files changed, 495 insertions(+), 248 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/extensions.go
 create mode 100644 pkg/sentry/socket/netfilter/tcp_matcher.go
 create mode 100644 pkg/sentry/socket/netfilter/udp_matcher.go
 delete mode 100644 pkg/tcpip/iptables/udp_matcher.go

(limited to 'pkg/sentry/socket')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 8e40bcc62..e4aabb6bb 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -348,6 +348,58 @@ func goString(cstring []byte) string {
 	return string(cstring)
 }
 
+// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
+// in include/uapi/linux/netfilter/xt_tcpudp.h.
+type XTTCP struct {
+	// SourcePortStart specifies the inclusive start of the range of source
+	// ports to which the matcher applies.
+	SourcePortStart uint16
+
+	// SourcePortEnd specifies the inclusive end of the range of source ports
+	// to which the matcher applies.
+	SourcePortEnd uint16
+
+	// DestinationPortStart specifies the start of the destination port
+	// range to which the matcher applies.
+	DestinationPortStart uint16
+
+	// DestinationPortEnd specifies the end of the destination port
+	// range to which the matcher applies.
+	DestinationPortEnd uint16
+
+	// Option specifies that a particular TCP option must be set.
+	Option uint8
+
+	// FlagMask masks the FlagCompare byte when comparing to the TCP flag
+	// fields.
+	FlagMask uint8
+
+	// FlagCompare is binary and-ed with the TCP flag fields.
+	FlagCompare uint8
+
+	// InverseFlags flips the meaning of certain fields. See the
+	// XT_TCP_INV_* flags.
+	InverseFlags uint8
+}
+
+// SizeOfXTTCP is the size of an XTTCP.
+const SizeOfXTTCP = 12
+
+// Flags in XTTCP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_tcpudp.h.
+const (
+	// Invert the meaning of SourcePortStart/End.
+	XT_TCP_INV_SRCPT = 0x01
+	// Invert the meaning of DestinationPortStart/End.
+	XT_TCP_INV_DSTPT = 0x02
+	// Invert the meaning of FlagCompare.
+	XT_TCP_INV_FLAGS = 0x04
+	// Invert the meaning of Option.
+	XT_TCP_INV_OPTION = 0x08
+	// Enable all flags.
+	XT_TCP_INV_MASK = 0x0F
+)
+
 // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
 // in include/uapi/linux/netfilter/xt_tcpudp.h.
 type XTUDP struct {
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index fa2a2cb66..c91ec7494 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "netfilter",
     srcs = [
+        "extensions.go",
         "netfilter.go",
+        "tcp_matcher.go",
+        "udp_matcher.go",
     ],
     # This target depends on netstack and should only be used by epsocket,
     # which is allowed to depend on netstack.
@@ -17,6 +20,7 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/syserr",
         "//pkg/tcpip",
+        "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/usermem",
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
new file mode 100644
index 000000000..5a4cac84c
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -0,0 +1,98 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// TODO(gvisor.dev/issue/170): The following per-matcher params should be
+// supported:
+// - Table name
+// - Match size
+// - User size
+// - Hooks
+// - Proto
+// - Family
+
+// matchMarshaler knows how to (un)marshal the matcher named name().
+type matchMarshaler interface {
+	// name is the matcher name as stored in the xt_entry_match struct.
+	name() string
+
+	// marshal converts from an iptables.Matcher to an ABI struct.
+	marshal(matcher iptables.Matcher) []byte
+
+	// unmarshal converts from the ABI matcher struct to an
+	// iptables.Matcher.
+	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
+}
+
+var matchMarshalers = map[string]matchMarshaler{}
+
+// registerMatchMarshaler should be called by match extensions to register them
+// with the netfilter package.
+func registerMatchMarshaler(mm matchMarshaler) {
+	if _, ok := matchMarshalers[mm.name()]; ok {
+		panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name()))
+	}
+	matchMarshalers[mm.name()] = mm
+}
+
+func marshalMatcher(matcher iptables.Matcher) []byte {
+	matchMaker, ok := matchMarshalers[matcher.Name()]
+	if !ok {
+		panic(fmt.Errorf("Unknown matcher of type %T.", matcher))
+	}
+	return matchMaker.marshal(matcher)
+}
+
+// marshalEntryMatch creates a marshalled XTEntryMatch with the given name and
+// data appended at the end.
+func marshalEntryMatch(name string, data []byte) []byte {
+	nflog("marshaling matcher %q", name)
+
+	// We have to pad this struct size to a multiple of 8 bytes.
+	size := alignUp(linux.SizeOfXTEntryMatch+len(data), 8)
+	matcher := linux.KernelXTEntryMatch{
+		XTEntryMatch: linux.XTEntryMatch{
+			MatchSize: uint16(size),
+		},
+		Data: data,
+	}
+	copy(matcher.Name[:], name)
+
+	buf := make([]byte, 0, size)
+	buf = binary.Marshal(buf, usermem.ByteOrder, matcher)
+	return append(buf, make([]byte, size-len(buf))...)
+}
+
+func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
+	matchMaker, ok := matchMarshalers[match.Name.String()]
+	if !ok {
+		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
+	}
+	return matchMaker.unmarshal(buf, filter)
+}
+
+// alignUp rounds a length up to an alignment. align must be a power of 2.
+func alignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3dda6c7a1..8f14643b0 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,6 +17,7 @@
 package netfilter
 
 import (
+	"errors"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -34,10 +35,6 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
-const (
-	matcherNameUDP = "udp"
-)
-
 // Metadata is used to verify that we are correctly serializing and
 // deserializing iptables into structs consumable by the iptables tool. We save
 // a metadata struct when the tables are written, and when they are read out we
@@ -68,7 +65,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	// Find the appropriate table.
 	table, err := findTable(stack, info.Name)
 	if err != nil {
-		return linux.IPTGetinfo{}, err
+		nflog("%v", err)
+		return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
 	}
 
 	// Get the hooks that apply to this table.
@@ -95,39 +93,40 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
-		log.Warningf("netfilter: couldn't copy in entries %q", userEntries.Name)
+		nflog("couldn't copy in entries %q", userEntries.Name)
 		return linux.KernelIPTGetEntries{}, syserr.FromError(err)
 	}
 
 	// Find the appropriate table.
 	table, err := findTable(stack, userEntries.Name)
 	if err != nil {
-		log.Warningf("netfilter: couldn't find table %q", userEntries.Name)
-		return linux.KernelIPTGetEntries{}, err
+		nflog("%v", err)
+		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
 	entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
 	if err != nil {
-		return linux.KernelIPTGetEntries{}, err
+		nflog("couldn't read entries: %v", err)
+		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 	if meta != table.Metadata().(metadata) {
 		panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
 	}
 	if binary.Size(entries) > uintptr(outLen) {
-		log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
+		nflog("insufficient GetEntries output size: %d", uintptr(outLen))
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
 	}
 
 	return entries, nil
 }
 
-func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, *syserr.Error) {
+func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) {
 	ipt := stack.IPTables()
 	table, ok := ipt.Tables[tablename.String()]
 	if !ok {
-		return iptables.Table{}, syserr.ErrInvalidArgument
+		return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 	return table, nil
 }
@@ -151,19 +150,19 @@ func FillDefaultIPTables(stack *stack.Stack) {
 	stack.SetIPTables(ipt)
 }
 
+// TODO: Return proto.
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) {
 	// Return values.
 	var entries linux.KernelIPTGetEntries
 	var meta metadata
 
 	// The table name has to fit in the struct.
 	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
-		log.Warningf("Table name %q too long.", tablename)
-		return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+		return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("Table name %q too long.", tablename)
 	}
 	copy(entries.Name[:], tablename)
 
@@ -229,46 +228,6 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-func marshalMatcher(matcher iptables.Matcher) []byte {
-	switch m := matcher.(type) {
-	case *iptables.UDPMatcher:
-		return marshalUDPMatcher(m)
-	default:
-		// TODO(gvisor.dev/issue/170): Support other matchers.
-		panic(fmt.Errorf("unknown matcher of type %T", matcher))
-	}
-}
-
-func marshalUDPMatcher(matcher *iptables.UDPMatcher) []byte {
-	nflog("convert to binary: marshalling UDP matcher: %+v", matcher)
-
-	// We have to pad this struct size to a multiple of 8 bytes.
-	size := alignUp(linux.SizeOfXTEntryMatch+linux.SizeOfXTUDP, 8)
-
-	linuxMatcher := linux.KernelXTEntryMatch{
-		XTEntryMatch: linux.XTEntryMatch{
-			MatchSize: uint16(size),
-		},
-		Data: make([]byte, 0, linux.SizeOfXTUDP),
-	}
-	copy(linuxMatcher.Name[:], matcherNameUDP)
-
-	xtudp := linux.XTUDP{
-		SourcePortStart:      matcher.Data.SourcePortStart,
-		SourcePortEnd:        matcher.Data.SourcePortEnd,
-		DestinationPortStart: matcher.Data.DestinationPortStart,
-		DestinationPortEnd:   matcher.Data.DestinationPortEnd,
-		InverseFlags:         matcher.Data.InverseFlags,
-	}
-	linuxMatcher.Data = binary.Marshal(linuxMatcher.Data, usermem.ByteOrder, xtudp)
-
-	buf := make([]byte, 0, size)
-	buf = binary.Marshal(buf, usermem.ByteOrder, linuxMatcher)
-	buf = append(buf, make([]byte, size-len(buf))...)
-	nflog("convert to binary: marshalled UDP matcher into %v", buf)
-	return buf[:]
-}
-
 func marshalTarget(target iptables.Target) []byte {
 	switch target.(type) {
 	case iptables.UnconditionalAcceptTarget:
@@ -332,7 +291,7 @@ func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
 
 // translateToStandardVerdict translates from the value in a
 // linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
+func translateToStandardVerdict(val int32) (iptables.Verdict, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
@@ -340,13 +299,12 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 	case -linux.NF_DROP - 1:
 		return iptables.Drop, nil
 	case -linux.NF_QUEUE - 1:
-		log.Warningf("Unsupported iptables verdict QUEUE.")
+		return iptables.Invalid, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		log.Warningf("Unsupported iptables verdict RETURN.")
+		return iptables.Invalid, errors.New("unsupported iptables verdict RETURN")
 	default:
-		log.Warningf("Unknown iptables verdict %d.", val)
+		return iptables.Invalid, fmt.Errorf("unknown iptables verdict %d.", val)
 	}
-	return iptables.Invalid, syserr.ErrInvalidArgument
 }
 
 // SetEntries sets iptables rules for a single table. See
@@ -354,7 +312,7 @@ func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
 func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
-		log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
+		nflog("optVal has insufficient size for replace %d", len(optVal))
 		return syserr.ErrInvalidArgument
 	}
 	var replace linux.IPTReplace
@@ -368,7 +326,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	case iptables.TablenameFilter:
 		table = iptables.EmptyFilterTable()
 	default:
-		log.Warningf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
+		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
 	}
 
@@ -382,7 +340,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 		// Get the struct ipt_entry.
 		if len(optVal) < linux.SizeOfIPTEntry {
-			log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
+			nflog("optVal has insufficient size for entry %d", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		var entry linux.IPTEntry
@@ -392,7 +350,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		optVal = optVal[linux.SizeOfIPTEntry:]
 
 		if entry.TargetOffset < linux.SizeOfIPTEntry {
-			log.Warningf("netfilter: entry has too-small target offset %d", entry.TargetOffset)
+			nflog("entry has too-small target offset %d", entry.TargetOffset)
 			return syserr.ErrInvalidArgument
 		}
 
@@ -400,7 +358,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// filtering fields.
 		filter, err := filterFromIPTIP(entry.IP)
 		if err != nil {
-			return err
+			nflog("bad iptip: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 
 		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
@@ -408,25 +367,26 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// Get matchers.
 		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
 		if len(optVal) < int(matchersSize) {
-			log.Warningf("netfilter: entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		matchers, err := parseMatchers(filter, optVal[:matchersSize])
 		if err != nil {
-			log.Warningf("netfilter: failed to parse matchers: %v", err)
-			return err
+			nflog("failed to parse matchers: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 		optVal = optVal[matchersSize:]
 
 		// Get the target of the rule.
 		targetSize := entry.NextOffset - entry.TargetOffset
 		if len(optVal) < int(targetSize) {
-			log.Warningf("netfilter: entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
 		target, err := parseTarget(optVal[:targetSize])
 		if err != nil {
-			return err
+			nflog("failed to parse target: %v", err)
+			return syserr.ErrInvalidArgument
 		}
 		optVal = optVal[targetSize:]
 
@@ -439,7 +399,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		offset += uint32(entry.NextOffset)
 
 		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
-			log.Warningf("netfilter: entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
 		}
 	}
 
@@ -457,11 +417,11 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 				}
 			}
 			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
-				log.Warningf("Hook %v is unset.", hk)
+				nflog("hook %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
-				log.Warningf("Underflow %v is unset.", hk)
+				nflog("underflow %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
 		}
@@ -473,7 +433,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, ruleIdx := range table.BuiltinChains {
 		if hook != iptables.Input {
 			if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok {
-				log.Warningf("Hook %d is unsupported.", hook)
+				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
 		}
@@ -499,7 +459,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
-func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, *syserr.Error) {
+func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) {
 	nflog("set entries: parsing matchers of size %d", len(optVal))
 	var matchers []iptables.Matcher
 	for len(optVal) > 0 {
@@ -507,8 +467,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		// Get the XTEntryMatch.
 		if len(optVal) < linux.SizeOfXTEntryMatch {
-			log.Warningf("netfilter: optVal has insufficient size for entry match: %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal))
 		}
 		var match linux.XTEntryMatch
 		buf := optVal[:linux.SizeOfXTEntryMatch]
@@ -517,45 +476,18 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
-			log.Warningf("netfilter: match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
-			return nil, syserr.ErrInvalidArgument
+
+			return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
 		}
 		if len(optVal) < int(match.MatchSize) {
-			log.Warningf("netfilter: optVal has insufficient size for match: %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal))
 		}
 
-		buf = optVal[linux.SizeOfXTEntryMatch:match.MatchSize]
-		var matcher iptables.Matcher
-		var err error
-		switch match.Name.String() {
-		case matcherNameUDP:
-			if len(buf) < linux.SizeOfXTUDP {
-				log.Warningf("netfilter: optVal has insufficient size for UDP match: %d", len(optVal))
-				return nil, syserr.ErrInvalidArgument
-			}
-			// For alignment reasons, the match's total size may
-			// exceed what's strictly necessary to hold matchData.
-			var matchData linux.XTUDP
-			binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
-			log.Infof("parseMatchers: parsed XTUDP: %+v", matchData)
-			matcher, err = iptables.NewUDPMatcher(filter, iptables.UDPMatcherParams{
-				SourcePortStart:      matchData.SourcePortStart,
-				SourcePortEnd:        matchData.SourcePortEnd,
-				DestinationPortStart: matchData.DestinationPortStart,
-				DestinationPortEnd:   matchData.DestinationPortEnd,
-				InverseFlags:         matchData.InverseFlags,
-			})
-			if err != nil {
-				log.Warningf("netfilter: failed to create UDP matcher: %v", err)
-				return nil, syserr.ErrInvalidArgument
-			}
-
-		default:
-			log.Warningf("netfilter: unsupported matcher with name %q", match.Name.String())
-			return nil, syserr.ErrInvalidArgument
+		// Parse the specific matcher.
+		matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize])
+		if err != nil {
+			return nil, fmt.Errorf("failed to create matcher: %v", err)
 		}
-
 		matchers = append(matchers, matcher)
 
 		// TODO(gvisor.dev/issue/170): Check the revision field.
@@ -563,8 +495,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 	}
 
 	if len(optVal) != 0 {
-		log.Warningf("netfilter: optVal should be exhausted after parsing matchers")
-		return nil, syserr.ErrInvalidArgument
+		return nil, errors.New("optVal should be exhausted after parsing matchers")
 	}
 
 	return matchers, nil
@@ -572,11 +503,10 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
+func parseTarget(optVal []byte) (iptables.Target, error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
-		log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
-		return nil, syserr.ErrInvalidArgument
+		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
 	}
 	var target linux.XTEntryTarget
 	buf := optVal[:linux.SizeOfXTEntryTarget]
@@ -585,8 +515,7 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 	case "":
 		// Standard target.
 		if len(optVal) != linux.SizeOfXTStandardTarget {
-			log.Warningf("netfilter.SetEntries: optVal has wrong size for standard target %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
 		}
 		var standardTarget linux.XTStandardTarget
 		buf = optVal[:linux.SizeOfXTStandardTarget]
@@ -602,15 +531,13 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case iptables.Drop:
 			return iptables.UnconditionalDropTarget{}, nil
 		default:
-			log.Warningf("Unknown verdict: %v", verdict)
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("Unknown verdict: %v", verdict)
 		}
 
 	case errorTargetName:
 		// Error target.
 		if len(optVal) != linux.SizeOfXTErrorTarget {
-			log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
 		}
 		var errorTarget linux.XTErrorTarget
 		buf = optVal[:linux.SizeOfXTErrorTarget]
@@ -627,20 +554,17 @@ func parseTarget(optVal []byte) (iptables.Target, *syserr.Error) {
 		case errorTargetName:
 			return iptables.ErrorTarget{}, nil
 		default:
-			log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
-			return nil, syserr.ErrInvalidArgument
+			return nil, fmt.Errorf("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
 		}
 	}
 
 	// Unknown target.
-	log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-	return nil, syserr.ErrInvalidArgument
+	return nil, fmt.Errorf("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) {
+func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) {
 	if containsUnsupportedFields(iptip) {
-		log.Warningf("netfilter: unsupported fields in struct iptip: %+v", iptip)
-		return iptables.IPHeaderFilter{}, syserr.ErrInvalidArgument
+		return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
 	}
 	return iptables.IPHeaderFilter{
 		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
@@ -678,8 +602,3 @@ func hookFromLinux(hook int) iptables.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
-
-// alignUp rounds a length up to an alignment. align must be a power of 2.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
new file mode 100644
index 000000000..1646d22f7
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -0,0 +1,143 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameTCP = "tcp"
+
+func init() {
+	registerMatchMarshaler(tcpMarshaler{})
+}
+
+// tcpMarshaler implements matchMarshaler for TCP matching.
+type tcpMarshaler struct{}
+
+// name implements matchMarshaler.name.
+func (tcpMarshaler) name() string {
+	return matcherNameTCP
+}
+
+// marshal implements matchMarshaler.marshal. It panics unless mr is a *TCPMatcher.
+func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
+	matcher := mr.(*TCPMatcher)
+	xttcp := linux.XTTCP{
+		SourcePortStart:      matcher.sourcePortStart,
+		SourcePortEnd:        matcher.sourcePortEnd,
+		DestinationPortStart: matcher.destinationPortStart,
+		DestinationPortEnd:   matcher.destinationPortEnd,
+	}
+	buf := make([]byte, 0, linux.SizeOfXTTCP) // sized for the XTTCP payload, not XTUDP
+	return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp))
+}
+
+// unmarshal implements matchMarshaler.unmarshal.
+func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+	if len(buf) < linux.SizeOfXTTCP {
+		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may
+	// exceed what's strictly necessary to hold matchData.
+	var matchData linux.XTTCP
+	binary.Unmarshal(buf[:linux.SizeOfXTTCP], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed XTTCP: %+v", matchData)
+
+	if matchData.Option != 0 ||
+		matchData.FlagMask != 0 ||
+		matchData.FlagCompare != 0 ||
+		matchData.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported TCP matcher flags set")
+	}
+
+	if filter.Protocol != header.TCPProtocolNumber {
+		return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
+	}
+
+	return &TCPMatcher{
+		sourcePortStart:      matchData.SourcePortStart,
+		sourcePortEnd:        matchData.SourcePortEnd,
+		destinationPortStart: matchData.DestinationPortStart,
+		destinationPortEnd:   matchData.DestinationPortEnd,
+	}, nil
+}
+
+// TCPMatcher matches TCP packets and their headers. It implements Matcher.
+type TCPMatcher struct {
+	sourcePortStart      uint16
+	sourcePortEnd        uint16
+	destinationPortStart uint16
+	destinationPortEnd   uint16
+}
+
+// Name implements Matcher.Name.
+func (*TCPMatcher) Name() string {
+	return matcherNameTCP
+}
+
+// Match implements Matcher.Match.
+func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return false, false
+	}
+
+	// We don't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
+	var tcpHeader header.TCP
+	if pkt.TransportHeader != nil {
+		tcpHeader = header.TCP(pkt.TransportHeader)
+	} else {
+		// The TCP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.TCPMinimumSize {
+			// There's no valid TCP header here, so we hotdrop the
+			// packet.
+			return false, true
+		}
+		tcpHeader = header.TCP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	if sourcePort := tcpHeader.SourcePort(); sourcePort < tm.sourcePortStart || tm.sourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort := tcpHeader.DestinationPort(); destinationPort < tm.destinationPortStart || tm.destinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
new file mode 100644
index 000000000..b6e95bbc5
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameUDP = "udp"
+
+func init() {
+	registerMatchMarshaler(udpMarshaler{})
+}
+
+// udpMarshaler implements matchMarshaler for UDP matching.
+type udpMarshaler struct{}
+
+// name implements matchMarshaler.name.
+func (udpMarshaler) name() string {
+	return matcherNameUDP
+}
+
+// marshal implements matchMarshaler.marshal.
+func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
+	matcher := mr.(*UDPMatcher)
+	xtudp := linux.XTUDP{
+		SourcePortStart:      matcher.sourcePortStart,
+		SourcePortEnd:        matcher.sourcePortEnd,
+		DestinationPortStart: matcher.destinationPortStart,
+		DestinationPortEnd:   matcher.destinationPortEnd,
+	}
+	buf := make([]byte, 0, linux.SizeOfXTUDP)
+	return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp))
+}
+
+// unmarshal implements matchMarshaler.unmarshal.
+func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+	if len(buf) < linux.SizeOfXTUDP {
+		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may exceed what's
+	// strictly necessary to hold matchData.
+	var matchData linux.XTUDP
+	binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed XTUDP: %+v", matchData)
+
+	if matchData.InverseFlags != 0 {
+		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
+	}
+
+	if filter.Protocol != header.UDPProtocolNumber {
+		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+	}
+
+	return &UDPMatcher{
+		sourcePortStart:      matchData.SourcePortStart,
+		sourcePortEnd:        matchData.SourcePortEnd,
+		destinationPortStart: matchData.DestinationPortStart,
+		destinationPortEnd:   matchData.DestinationPortEnd,
+	}, nil
+}
+
+// UDPMatcher matches UDP packets and their headers. It implements Matcher.
+type UDPMatcher struct {
+	sourcePortStart      uint16
+	sourcePortEnd        uint16
+	destinationPortStart uint16
+	destinationPortEnd   uint16
+}
+
+// Name implements Matcher.Name.
+func (*UDPMatcher) Name() string {
+	return matcherNameUDP
+}
+
+// Match implements Matcher.Match.
+func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+
+	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+	// into the iptables.Check codepath as matchers are added.
+	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+		return false, false
+	}
+
+	// We don't match fragments.
+	if frag := netHeader.FragmentOffset(); frag != 0 {
+		if frag == 1 {
+			return false, true
+		}
+		return false, false
+	}
+
+	// Now we need the transport header. However, this may not have been set
+	// yet.
+	// TODO(gvisor.dev/issue/170): Parsing the transport header should
+	// ultimately be moved into the iptables.Check codepath as matchers are
+	// added.
+	var udpHeader header.UDP
+	if pkt.TransportHeader != nil {
+		udpHeader = header.UDP(pkt.TransportHeader)
+	} else {
+		// The UDP header hasn't been parsed yet. We have to do it here.
+		if len(pkt.Data.First()) < header.UDPMinimumSize {
+			// There's no valid UDP header here, so we hotdrop the
+			// packet.
+			return false, true
+		}
+		udpHeader = header.UDP(pkt.Data.First())
+	}
+
+	// Check whether the source and destination ports are within the
+	// matching range.
+	if sourcePort := udpHeader.SourcePort(); sourcePort < um.sourcePortStart || um.sourcePortEnd < sourcePort {
+		return false, false
+	}
+	if destinationPort := udpHeader.DestinationPort(); destinationPort < um.destinationPortStart || um.destinationPortEnd < destinationPort {
+		return false, false
+	}
+
+	return true, false
+}
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index bab26580b..d1b73cfdf 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -8,7 +8,6 @@ go_library(
         "iptables.go",
         "targets.go",
         "types.go",
-        "udp_matcher.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 7f77802a0..d660aab04 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -171,6 +171,9 @@ type IPHeaderFilter struct {
 
 // A Matcher is the interface for matching packets.
 type Matcher interface {
+	// Name returns the name of the Matcher.
+	Name() string
+
 	// Match returns whether the packet matches and whether the packet
 	// should be "hotdropped", i.e. dropped immediately. This is usually
 	// used for suspicious packets.
diff --git a/pkg/tcpip/iptables/udp_matcher.go b/pkg/tcpip/iptables/udp_matcher.go
deleted file mode 100644
index 3bb076f9c..000000000
--- a/pkg/tcpip/iptables/udp_matcher.go
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-// TODO(gvisor.dev/issue/170): The following per-matcher params should be
-// supported:
-// - Table name
-// - Match size
-// - User size
-// - Hooks
-// - Proto
-// - Family
-
-// UDPMatcher matches UDP packets and their headers. It implements Matcher.
-type UDPMatcher struct {
-	Data UDPMatcherParams
-}
-
-// UDPMatcherParams are the parameters used to create a UDPMatcher.
-type UDPMatcherParams struct {
-	SourcePortStart      uint16
-	SourcePortEnd        uint16
-	DestinationPortStart uint16
-	DestinationPortEnd   uint16
-	InverseFlags         uint8
-}
-
-// NewUDPMatcher returns a new instance of UDPMatcher.
-func NewUDPMatcher(filter IPHeaderFilter, data UDPMatcherParams) (Matcher, error) {
-	log.Infof("Adding rule with UDPMatcherParams: %+v", data)
-
-	if data.InverseFlags != 0 {
-		return nil, fmt.Errorf("unsupported UDP matcher inverse flags set")
-	}
-
-	if filter.Protocol != header.UDPProtocolNumber {
-		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
-	}
-
-	return &UDPMatcher{Data: data}, nil
-}
-
-// Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader)
-
-	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
-	// into the iptables.Check codepath as matchers are added.
-	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
-		return false, false
-	}
-
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			log.Warningf("Dropping UDP packet: malicious fragmented packet.")
-			return false, true
-		}
-		return false, false
-	}
-
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
-	// added.
-	var udpHeader header.UDP
-	if pkt.TransportHeader != nil {
-		udpHeader = header.UDP(pkt.TransportHeader)
-	} else {
-		// The UDP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.UDPMinimumSize {
-			// There's no valid UDP header here, so we hotdrop the
-			// packet.
-			log.Warningf("Dropping UDP packet: size too small.")
-			return false, true
-		}
-		udpHeader = header.UDP(pkt.Data.First())
-	}
-
-	// Check whether the source and destination ports are within the
-	// matching range.
-	sourcePort := udpHeader.SourcePort()
-	destinationPort := udpHeader.DestinationPort()
-	if sourcePort < um.Data.SourcePortStart || um.Data.SourcePortEnd < sourcePort {
-		return false, false
-	}
-	if destinationPort < um.Data.DestinationPortStart || um.Data.DestinationPortEnd < destinationPort {
-		return false, false
-	}
-
-	return true, false
-}
-- 
cgit v1.2.3


From 29ad5762e4549d961f48c65292cfdeb7256524f6 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 31 Jan 2020 13:53:58 -0800
Subject: Spelling

---
 pkg/sentry/socket/netfilter/extensions.go  | 18 +++++++++---------
 pkg/sentry/socket/netfilter/tcp_matcher.go | 10 +++++-----
 pkg/sentry/socket/netfilter/udp_matcher.go | 10 +++++-----
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 5a4cac84c..b5fbb52e4 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -32,8 +32,8 @@ import (
 // - Proto
 // - Family
 
-// matchMarshaler knows how to (un)marshal the matcher named name().
-type matchMarshaler interface {
+// matchMaker knows how to (un)marshal the matcher named name().
+type matchMaker interface {
 	// name is the matcher name as stored in the xt_entry_match struct.
 	name() string
 
@@ -45,19 +45,19 @@ type matchMarshaler interface {
 	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
 }
 
-var matchMarshalers = map[string]matchMarshaler{}
+var matchMakers = map[string]matchMaker{}
 
-// registerMatchMarshaler should be called by match extensions to register them
+// registerMatchMaker should be called by match extensions to register them
 // with the netfilter package.
-func registerMatchMarshaler(mm matchMarshaler) {
-	if _, ok := matchMarshalers[mm.name()]; ok {
+func registerMatchMaker(mm matchMaker) {
+	if _, ok := matchMakers[mm.name()]; ok {
 		panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name()))
 	}
-	matchMarshalers[mm.name()] = mm
+	matchMakers[mm.name()] = mm
 }
 
 func marshalMatcher(matcher iptables.Matcher) []byte {
-	matchMaker, ok := matchMarshalers[matcher.Name()]
+	matchMaker, ok := matchMakers[matcher.Name()]
 	if !ok {
 		panic(fmt.Errorf("Unknown matcher of type %T.", matcher))
 	}
@@ -85,7 +85,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 }
 
 func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
-	matchMaker, ok := matchMarshalers[match.Name.String()]
+	matchMaker, ok := matchMakers[match.Name.String()]
 	if !ok {
 		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
 	}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 1646d22f7..6b2f4c31a 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -28,18 +28,18 @@ import (
 const matcherNameTCP = "tcp"
 
 func init() {
-	registerMatchMarshaler(tcpMarshaler{})
+	registerMatchMaker(tcpMarshaler{})
 }
 
-// tcpMarshaler implements matchMarshaler for TCP matching.
+// tcpMarshaler implements matchMaker for TCP matching.
 type tcpMarshaler struct{}
 
-// name implements matchMarshaler.name.
+// name implements matchMaker.name.
 func (tcpMarshaler) name() string {
 	return matcherNameTCP
 }
 
-// marshal implements matchMarshaler.marshal.
+// marshal implements matchMaker.marshal.
 func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 	matcher := mr.(*TCPMatcher)
 	xttcp := linux.XTTCP{
@@ -52,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 	return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp))
 }
 
-// unmarshal implements matchMarshaler.unmarshal.
+// unmarshal implements matchMaker.unmarshal.
 func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
 	if len(buf) < linux.SizeOfXTTCP {
 		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index b6e95bbc5..86aa11696 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -28,18 +28,18 @@ import (
 const matcherNameUDP = "udp"
 
 func init() {
-	registerMatchMarshaler(udpMarshaler{})
+	registerMatchMaker(udpMarshaler{})
 }
 
-// udpMarshaler implements matchMarshaler for UDP matching.
+// udpMarshaler implements matchMaker for UDP matching.
 type udpMarshaler struct{}
 
-// name implements matchMarshaler.name.
+// name implements matchMaker.name.
 func (udpMarshaler) name() string {
 	return matcherNameUDP
 }
 
-// marshal implements matchMarshaler.marshal.
+// marshal implements matchMaker.marshal.
 func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 	matcher := mr.(*UDPMatcher)
 	xtudp := linux.XTUDP{
@@ -52,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 	return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp))
 }
 
-// unmarshal implements matchMarshaler.unmarshal.
+// unmarshal implements matchMaker.unmarshal.
 func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
 	if len(buf) < linux.SizeOfXTUDP {
 		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
-- 
cgit v1.2.3


From 492229d0176c1af2ab4ea4cf91bf211e940b5b12 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 4 Feb 2020 11:28:36 -0800
Subject: VFS2 gofer client

Updates #1198

Opening host pipes (by spinning in fdpipe) and host sockets is not yet
complete, and will be done in a future CL.

Major differences from VFS1 gofer client (sentry/fs/gofer), with varying levels
of backportability:

- "Cache policies" are replaced by InteropMode, which controls the behavior of
  timestamps in addition to caching. Under InteropModeExclusive (analogous to
  cacheAll) and InteropModeWritethrough (analogous to cacheAllWritethrough),
  client timestamps are *not* written back to the server (it is not possible in
  9P or Linux for clients to set ctime, so writing back client-authoritative
  timestamps results in incoherence between atime/mtime and ctime). Under
  InteropModeShared (analogous to cacheRemoteRevalidating), client timestamps
  are not used at all (remote filesystem clocks are authoritative). cacheNone
  is translated to InteropModeShared + new option
  filesystemOptions.specialRegularFiles.

- Under InteropModeShared, "unstable attribute" reloading for permission
  checks, lookup, and revalidation are fused, which is feasible in VFS2 since
  gofer.filesystem controls path resolution. This results in a ~33% reduction
  in RPCs for filesystem operations compared to cacheRemoteRevalidating. For
  example, consider stat("/foo/bar/baz") where "/foo/bar/baz" fails
  revalidation, resulting in the instantiation of a new dentry:

  VFS1 RPCs:
  getattr("/")                          // fs.MountNamespace.FindLink() => fs.Inode.CheckPermission() => gofer.inodeOperations.check() => gofer.inodeOperations.UnstableAttr()
  walkgetattr("/", "foo") = fid1        // fs.Dirent.walk() => gofer.session.Revalidate() => gofer.cachePolicy.Revalidate()
  clunk(fid1)
  getattr("/foo")                       // CheckPermission
  walkgetattr("/foo", "bar") = fid2     // Revalidate
  clunk(fid2)
  getattr("/foo/bar")                   // CheckPermission
  walkgetattr("/foo/bar", "baz") = fid3 // Revalidate
  clunk(fid3)
  walkgetattr("/foo/bar", "baz") = fid4 // fs.Dirent.walk() => gofer.inodeOperations.Lookup
  getattr("/foo/bar/baz")               // linux.stat() => gofer.inodeOperations.UnstableAttr()

  VFS2 RPCs:
  getattr("/")                          // gofer.filesystem.walkExistingLocked()
  walkgetattr("/", "foo") = fid1        // gofer.filesystem.stepExistingLocked()
  clunk(fid1)
                                        // No getattr: walkgetattr already updated metadata for permission check
  walkgetattr("/foo", "bar") = fid2
  clunk(fid2)
  walkgetattr("/foo/bar", "baz") = fid3
                                        // No clunk: fid3 used for new gofer.dentry
                                        // No getattr: walkgetattr already updated metadata for stat()

- gofer.filesystem.unlinkAt() does not require instantiation of a dentry that
  represents the file to be deleted. Updates #898.

- gofer.regularFileFD.OnClose() skips Tflushf for regular files under
  InteropModeExclusive, as it's nonsensical to request a remote file flush
  without flushing locally-buffered writes to that remote file first.

- Symlink targets are cached when InteropModeShared is not in effect.

- p9.QID.Path (which is already required to be unique for each file within a
  server, and is accordingly already synthesized from device/inode numbers in
  all known gofers) is used as-is for inode numbers, rather than being mapped
  along with attr.RDev in the client to yet another synthetic inode number.

- Relevant parts of fsutil.CachingInodeOperations are inlined directly into
  gofer package code. This avoids having to duplicate part of its functionality
  in fsutil.HostMappable.

PiperOrigin-RevId: 293190213
---
 pkg/safemem/seq_unsafe.go                |   17 +
 pkg/sentry/fs/fsutil/BUILD               |    4 +-
 pkg/sentry/fs/fsutil/frame_ref_set.go    |   13 +-
 pkg/sentry/fs/fsutil/inode_cached.go     |    2 +-
 pkg/sentry/fsimpl/gofer/BUILD            |   55 ++
 pkg/sentry/fsimpl/gofer/directory.go     |  190 +++++
 pkg/sentry/fsimpl/gofer/filesystem.go    | 1087 ++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/gofer.go         | 1147 ++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/handle.go        |  135 ++++
 pkg/sentry/fsimpl/gofer/handle_unsafe.go |   66 ++
 pkg/sentry/fsimpl/gofer/p9file.go        |  219 ++++++
 pkg/sentry/fsimpl/gofer/pagemath.go      |   31 +
 pkg/sentry/fsimpl/gofer/regular_file.go  |  860 ++++++++++++++++++++++
 pkg/sentry/fsimpl/gofer/special_file.go  |  159 +++++
 pkg/sentry/fsimpl/gofer/symlink.go       |   47 ++
 pkg/sentry/fsimpl/gofer/time.go          |   75 ++
 pkg/sentry/fsimpl/tmpfs/filesystem.go    |    2 +-
 pkg/sentry/socket/hostinet/socket.go     |   23 +-
 18 files changed, 4103 insertions(+), 29 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/gofer/BUILD
 create mode 100644 pkg/sentry/fsimpl/gofer/directory.go
 create mode 100644 pkg/sentry/fsimpl/gofer/filesystem.go
 create mode 100644 pkg/sentry/fsimpl/gofer/gofer.go
 create mode 100644 pkg/sentry/fsimpl/gofer/handle.go
 create mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go
 create mode 100644 pkg/sentry/fsimpl/gofer/p9file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/pagemath.go
 create mode 100644 pkg/sentry/fsimpl/gofer/regular_file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/special_file.go
 create mode 100644 pkg/sentry/fsimpl/gofer/symlink.go
 create mode 100644 pkg/sentry/fsimpl/gofer/time.go

(limited to 'pkg/sentry/socket')

diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index 354a95dde..dcdfc9600 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"fmt"
 	"reflect"
+	"syscall"
 	"unsafe"
 )
 
@@ -297,3 +298,19 @@ func ZeroSeq(dsts BlockSeq) (uint64, error) {
 	}
 	return done, nil
 }
+
+// IovecsFromBlockSeq returns a []syscall.Iovec representing bs.
+func IovecsFromBlockSeq(bs BlockSeq) []syscall.Iovec {
+	iovs := make([]syscall.Iovec, 0, bs.NumBlocks())
+	for ; !bs.IsEmpty(); bs = bs.Tail() {
+		b := bs.Head()
+		iovs = append(iovs, syscall.Iovec{
+			Base: &b.ToSlice()[0],
+			Len:  uint64(b.Len()),
+		})
+		// We don't need to care about b.NeedSafecopy(), because the host
+		// kernel will handle such address ranges just fine (by returning
+		// EFAULT).
+	}
+	return iovs
+}
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 4ab2a384f..789369220 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -28,13 +28,13 @@ go_template_instance(
         "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
     },
     package = "fsutil",
-    prefix = "frameRef",
+    prefix = "FrameRef",
     template = "//pkg/segment:generic_set",
     types = {
         "Key": "uint64",
         "Range": "platform.FileRange",
         "Value": "uint64",
-        "Functions": "frameRefSetFunctions",
+        "Functions": "FrameRefSetFunctions",
     },
 )
 
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index dd63db32b..6564fd0c6 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -20,24 +20,25 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 )
 
-type frameRefSetFunctions struct{}
+// FrameRefSetFunctions implements segment.Functions for FrameRefSet.
+type FrameRefSetFunctions struct{}
 
 // MinKey implements segment.Functions.MinKey.
-func (frameRefSetFunctions) MinKey() uint64 {
+func (FrameRefSetFunctions) MinKey() uint64 {
 	return 0
 }
 
 // MaxKey implements segment.Functions.MaxKey.
-func (frameRefSetFunctions) MaxKey() uint64 {
+func (FrameRefSetFunctions) MaxKey() uint64 {
 	return math.MaxUint64
 }
 
 // ClearValue implements segment.Functions.ClearValue.
-func (frameRefSetFunctions) ClearValue(val *uint64) {
+func (FrameRefSetFunctions) ClearValue(val *uint64) {
 }
 
 // Merge implements segment.Functions.Merge.
-func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
+func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
 	if val1 != val2 {
 		return 0, false
 	}
@@ -45,6 +46,6 @@ func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
 }
 
 // Split implements segment.Functions.Split.
-func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
+func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
 	return val, val
 }
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 573b8586e..800c8b4e1 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -111,7 +111,7 @@ type CachingInodeOperations struct {
 	// refs tracks active references to data in the cache.
 	//
 	// refs is protected by dataMu.
-	refs frameRefSet
+	refs FrameRefSet
 }
 
 // CachingInodeOperationsOptions configures a CachingInodeOperations.
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
new file mode 100644
index 000000000..4ba76a1e8
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -0,0 +1,55 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "gofer",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dentry",
+        "Linker": "*dentry",
+    },
+)
+
+go_library(
+    name = "gofer",
+    srcs = [
+        "dentry_list.go",
+        "directory.go",
+        "filesystem.go",
+        "gofer.go",
+        "handle.go",
+        "handle_unsafe.go",
+        "p9file.go",
+        "pagemath.go",
+        "regular_file.go",
+        "special_file.go",
+        "symlink.go",
+        "time.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fd",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/safemem",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/unet",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
new file mode 100644
index 000000000..baa2cdd8e
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -0,0 +1,190 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+	return d.fileType() == linux.S_IFDIR
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop !=
+// InteropModeShared.
+func (d *dentry) cacheNegativeChildLocked(name string) {
+	if d.negativeChildren == nil {
+		d.negativeChildren = make(map[string]struct{})
+	}
+	d.negativeChildren[name] = struct{}{}
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	mu      sync.Mutex
+	off     int64
+	dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	if fd.dirents == nil {
+		ds, err := fd.dentry().getDirents(ctx)
+		if err != nil {
+			return err
+		}
+		fd.dirents = ds
+	}
+
+	for fd.off < int64(len(fd.dirents)) {
+		if !cb.Handle(fd.dirents[fd.off]) {
+			return nil
+		}
+		fd.off++
+	}
+	return nil
+}
+
+// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+	// 9P2000.L's readdir does not specify behavior in the presence of
+	// concurrent mutation of an iterated directory, so implementations may
+	// duplicate or omit entries in this case, which violates POSIX semantics.
+	// Thus we read all directory entries while holding d.dirMu to exclude
+	// directory mutations. (Note that it is impossible for the client to
+	// exclude concurrent mutation from other remote filesystem users. Since
+	// there is no way to detect if the server has incorrectly omitted
+	// directory entries, we simply assume that the server is well-behaved
+	// under InteropModeShared.) This is inconsistent with Linux (which appears
+	// to assume that directory fids have the correct semantics, and translates
+	// struct file_operations::readdir calls directly to readdir RPCs), but is
+	// consistent with VFS1.
+
+	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+	if d.dirents != nil {
+		return d.dirents, nil
+	}
+
+	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
+	// so we generate them here.
+	parent := d.vfsd.ParentOrSelf().Impl().(*dentry)
+	dirents := []vfs.Dirent{
+		{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     d.ino,
+			NextOff: 1,
+		},
+		{
+			Name:    "..",
+			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
+			Ino:     parent.ino,
+			NextOff: 2,
+		},
+	}
+	off := uint64(0)
+	const count = 64 * 1024 // for consistency with the vfs1 client
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	if !d.handleReadable {
+		// This should not be possible because a readable handle should have
+		// been opened when the calling directoryFD was opened.
+		panic("gofer.dentry.getDirents called without a readable handle")
+	}
+	for {
+		p9ds, err := d.handle.file.readdir(ctx, off, count)
+		if err != nil {
+			return nil, err
+		}
+		if len(p9ds) == 0 {
+			// Cache dirents for future directoryFDs if permitted.
+			if d.fs.opts.interop != InteropModeShared {
+				d.dirents = dirents
+			}
+			return dirents, nil
+		}
+		for _, p9d := range p9ds {
+			if p9d.Name == "." || p9d.Name == ".." {
+				continue
+			}
+			dirent := vfs.Dirent{
+				Name:    p9d.Name,
+				Ino:     p9d.QID.Path,
+				NextOff: int64(len(dirents) + 1),
+			}
+			// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+			// DMSOCKET.
+			switch p9d.Type {
+			case p9.TypeSymlink:
+				dirent.Type = linux.DT_LNK
+			case p9.TypeDir:
+				dirent.Type = linux.DT_DIR
+			default:
+				dirent.Type = linux.DT_REG
+			}
+			dirents = append(dirents, dirent)
+		}
+		off = p9ds[len(p9ds)-1].Offset
+	}
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		if offset == 0 {
+			// Ensure that the next call to fd.IterDirents() calls
+			// fd.dentry().getDirents().
+			fd.dirents = nil
+		}
+		fd.off = offset
+		return fd.off, nil
+	case linux.SEEK_CUR:
+		offset += fd.off
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		// Don't clear fd.dirents in this case, even if offset == 0.
+		fd.off = offset
+		return fd.off, nil
+	default:
+		return 0, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
new file mode 100644
index 000000000..8eb61debf
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -0,0 +1,1087 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+//
+// It syncs the shared handle of every tracked dentry and every tracked
+// special file FD, continuing past failures and returning the first error
+// encountered (or nil).
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// Copy both sets while holding syncMu so that the syncing below runs
+	// without that lock held.
+	fs.syncMu.Lock()
+	dentries := make([]*dentry, 0, len(fs.dentries))
+	for d := range fs.dentries {
+		dentries = append(dentries, d)
+	}
+	specialFDs := make([]*specialFileFD, 0, len(fs.specialFileFDs))
+	for sffd := range fs.specialFileFDs {
+		specialFDs = append(specialFDs, sffd)
+	}
+	fs.syncMu.Unlock()
+
+	// note remembers only the first non-nil error passed to it.
+	var firstErr error
+	note := func(err error) {
+		if firstErr == nil && err != nil {
+			firstErr = err
+		}
+	}
+
+	// Regular files are synced through their dentry's shared handle. Dentries
+	// on which a reference can no longer be taken are skipped.
+	for _, d := range dentries {
+		if d.TryIncRef() {
+			err := d.syncSharedHandle(ctx)
+			d.DecRef()
+			note(err)
+		}
+	}
+
+	// Special files may be writable but do not use dentry shared handles, so
+	// the loop above does not cover them.
+	for _, sffd := range specialFDs {
+		if sffd.vfsfd.TryIncRef() {
+			err := sffd.Sync(ctx)
+			sffd.vfsfd.DecRef()
+			note(err)
+		}
+	}
+
+	return firstErr
+}
+
+// maxFilenameLen is the maximum length of a filename in bytes. This is dictated
+// by 9P's encoding of strings, which uses a 2-byte length prefix (max 65535).
+const maxFilenameLen = (1 << 16) - 1
+
+// dentrySlicePool recycles the slices used to collect dentries on which
+// dentry.checkCachingLocked() must later be called. It stores *[]*dentry
+// rather than []*dentry: sync.Pool operates on interface{}, so putting a
+// slice value into it directly would allocate a copy of the slice header on
+// the heap on every Put().
+var dentrySlicePool = sync.Pool{
+	New: func() interface{} {
+		// The initial capacity is arbitrary; it only needs to be non-zero.
+		fresh := make([]*dentry, 0, 4)
+		return &fresh
+	},
+}
+
+// appendDentry appends d to *ds and returns the slice pointer that now holds
+// it. If ds is nil, a slice is first taken from dentrySlicePool.
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	out := ds
+	if out == nil {
+		out = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*out = append(*out, d)
+	return out
+}
+
+// putDentrySlice empties *ds and returns it to dentrySlicePool.
+//
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	s := *ds
+	// Nil out every element so the pooled backing array does not keep the
+	// dentries reachable, allowing them to be GC'd.
+	for i := range s {
+		s[i] = nil
+	}
+	*ds = s[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory d; "." resolves to d itself and ".." to rp.ResolveParent(d).
+//
+// Dentries which may become cached as a result of the traversal are appended
+// to *ds. On every successful return rp has been advanced past the component.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached
+// metadata must be up to date.
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { // traversal needs exec (search) permission on d
+		return nil, err
+	}
+afterSymlink: // re-entered after rp.HandleSymlink() has accepted a symlink target
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		parentVFSD, err := rp.ResolveParent(&d.vfsd)
+		if err != nil {
+			return nil, err
+		}
+		parent := parentVFSD.Impl().(*dentry)
+		if fs.opts.interop == InteropModeShared {
+			// We must assume that parentVFSD is correct, because if d has been
+			// moved elsewhere in the remote filesystem so that its parent has
+			// changed, we have no way of determining its new parent's location
+			// in the filesystem. Get updated metadata for parentVFSD.
+			_, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask())
+			if err != nil {
+				return nil, err
+			}
+			parent.updateFromP9Attrs(attrMask, &attr)
+		}
+		rp.Advance()
+		return parent, nil
+	}
+	childVFSD, err := rp.ResolveChild(&d.vfsd, name) // may be nil; checked by revalidateChildLocked below
+	if err != nil {
+		return nil, err
+	}
+	// FIXME(jamieliu): Linux performs revalidation before mount lookup
+	// (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(),
+	// __follow_mount_rcu()).
+	child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds)
+	if err != nil {
+		return nil, err
+	}
+	if child == nil { // revalidateChildLocked returns (nil, nil) when no file exists at name
+		return nil, syserror.ENOENT
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // skip the isDir/permission checks on d; they already passed
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct. If no file
+// exists at name, revalidateChildLocked returns (nil, nil).
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// parent.isDir(). name is not "." or "..".
+//
+// Postconditions: If revalidateChildLocked returns a non-nil dentry, its
+// cached metadata is up to date.
+func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) {
+	if childVFSD != nil && fs.opts.interop != InteropModeShared {
+		// We have a cached dentry that is assumed to be correct.
+		return childVFSD.Impl().(*dentry), nil
+	}
+	// We either don't have a cached dentry or need to verify that it's still
+	// correct, either of which requires a remote lookup. Check if this name is
+	// valid before performing the lookup.
+	if len(name) > maxFilenameLen {
+		return nil, syserror.ENAMETOOLONG
+	}
+	// Check if we've already cached this lookup with a negative result.
+	if _, ok := parent.negativeChildren[name]; ok {
+		return nil, nil
+	}
+	// Perform the remote lookup.
+	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
+	if err != nil && err != syserror.ENOENT { // ENOENT falls through; presumably file is nil then (TODO confirm against walkGetAttrOne)
+		return nil, err
+	}
+	if childVFSD != nil {
+		child := childVFSD.Impl().(*dentry)
+		if !file.isNil() && qid.Path == child.ino {
+			// The file at this path hasn't changed. Just update cached
+			// metadata.
+			file.close(ctx) // the freshly walked file is not kept; the existing dentry is reused
+			child.updateFromP9Attrs(attrMask, &attr)
+			return child, nil
+		}
+		// The file at this path has changed or no longer exists. Remove
+		// the stale dentry from the tree, and re-evaluate its caching
+		// status (i.e. if it has 0 references, drop it).
+		vfsObj.ForceDeleteDentry(childVFSD)
+		*ds = appendDentry(*ds, child)
+		childVFSD = nil
+	}
+	if file.isNil() {
+		// No file exists at this path now. Cache the negative lookup if
+		// allowed.
+		if fs.opts.interop != InteropModeShared {
+			parent.cacheNegativeChildLocked(name)
+		}
+		return nil, nil
+	}
+	// Create a new dentry representing the file.
+	child, err := fs.newDentry(ctx, file, qid, attrMask, &attr)
+	if err != nil {
+		file.close(ctx) // the new dentry was not created, so close the walked file here
+		return nil, err
+	}
+	parent.IncRef() // reference held by child on its parent
+	parent.vfsd.InsertChild(&child.vfsd, name)
+	// For now, child has 0 references, so our caller should call
+	// child.checkCachingLocked().
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// walkParentDirLocked walks rp, beginning at d (usually
+// rp.Start().Impl().(*dentry)), until only the final path component remains,
+// and returns the directory that component is relative to. ENOTDIR is
+// returned if that dentry is not a directory. No check is made that the
+// returned directory is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop ==
+// InteropModeShared, then d's cached metadata must be up to date.
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	cur := d
+	for {
+		if rp.Final() {
+			break
+		}
+		// dirMu is held only for the duration of each individual step.
+		cur.dirMu.Lock()
+		stepped, err := fs.stepLocked(ctx, rp, cur, ds)
+		cur.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		cur = stepped
+	}
+	if cur.isDir() {
+		return cur, nil
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// resolveLocked walks every remaining component of rp and returns the dentry
+// of the existing file it names. ENOTDIR is returned if rp requires a
+// directory and the result is not one.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	cur := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// fs.stepLocked() requires up-to-date metadata for the dentry it
+		// starts from, so refresh rp.Start() first.
+		if err := cur.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	for {
+		if rp.Done() {
+			break
+		}
+		// dirMu is held only for the duration of each individual step.
+		cur.dirMu.Lock()
+		stepped, err := fs.stepLocked(ctx, rp, cur, ds)
+		cur.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		cur = stepped
+	}
+	if rp.MustBeDir() && !cur.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return cur, nil
+}
+
+// doCreateAt verifies that a new file may be created at the final component
+// of rp and, if so, calls create with the parent directory and the new name.
+// dir reports whether the file being created is a directory.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	shared := fs.opts.interop == InteropModeShared
+	start := rp.Start().Impl().(*dentry)
+	if shared {
+		// fs.walkParentDirLocked() requires up-to-date metadata for start.
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parentDir, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parentDir.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	if parentDir.isDeleted() {
+		return syserror.ENOENT
+	}
+	name := rp.Component()
+	switch {
+	case name == "." || name == "..":
+		return syserror.EEXIST
+	case len(name) > maxFilenameLen:
+		return syserror.ENAMETOOLONG
+	case !dir && rp.MustBeDir():
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	parentDir.dirMu.Lock()
+	defer parentDir.dirMu.Unlock()
+	if !shared {
+		if parentDir.vfsd.Child(name) != nil {
+			return syserror.EEXIST
+		}
+		// The absence of a cached dentry does not prove that no file exists
+		// at name, so the creation RPC is attempted regardless.
+		if err := create(parentDir, name); err != nil {
+			return err
+		}
+		// Bring the parent's cached state in line with the new child.
+		parentDir.touchCMtime(ctx)
+		delete(parentDir.negativeChildren, name)
+		parentDir.dirents = nil
+		return nil
+	}
+	// Under InteropModeShared a cached dentry at name proves nothing, since
+	// the file it represents may have been deleted remotely; revalidating it
+	// would cost an RPC. Issue the creation RPC directly instead: if a file
+	// exists it fails with EEXIST just as we would have, and if it succeeds
+	// while a stale dentry exists, that dentry fails revalidation the next
+	// time it is used.
+	return create(parentDir, name)
+}
+
+// unlinkAt removes the file named by the final component of rp from its
+// parent directory. If dir is true, the file must be a directory and is
+// removed with AT_REMOVEDIR; otherwise the file must not be a directory.
+//
+// Preconditions: !rp.Done().
+func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	name := rp.Component()
+	if dir {
+		if name == "." {
+			return syserror.EINVAL
+		}
+		if name == ".." {
+			return syserror.ENOTEMPTY
+		}
+	} else {
+		if name == "." || name == ".." {
+			return syserror.EISDIR
+		}
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	childVFSD := parent.vfsd.Child(name)
+	var child *dentry
+	// We only need a dentry representing the file at name if it can be a mount
+	// point. If childVFSD is nil, then it can't be a mount point. If childVFSD
+	// is non-nil but stale, the actual file can't be a mount point either; we
+	// detect this case by just speculatively calling PrepareDeleteDentry and
+	// only revalidating the dentry if that fails (indicating that the existing
+	// dentry is a mount point).
+	if childVFSD != nil {
+		child = childVFSD.Impl().(*dentry)
+		if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds)
+			if err != nil {
+				return err
+			}
+			if child != nil {
+				childVFSD = &child.vfsd
+				if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+					return err
+				}
+			} else {
+				childVFSD = nil
+			}
+		}
+	} else if _, ok := parent.negativeChildren[name]; ok {
+		return syserror.ENOENT
+	}
+	// From here on, a non-nil childVFSD has been successfully passed to
+	// PrepareDeleteDentry. That call must be balanced by AbortDeleteDentry or
+	// CommitDeleteDentry on every return path, including the file type
+	// mismatches below and not only a failed unlink RPC.
+	abortDelete := func() {
+		if childVFSD != nil {
+			vfsObj.AbortDeleteDentry(childVFSD)
+		}
+	}
+	flags := uint32(0)
+	if dir {
+		if child != nil && !child.isDir() {
+			abortDelete()
+			return syserror.ENOTDIR
+		}
+		flags = linux.AT_REMOVEDIR
+	} else {
+		if child != nil && child.isDir() {
+			abortDelete()
+			return syserror.EISDIR
+		}
+		if rp.MustBeDir() {
+			abortDelete()
+			return syserror.ENOTDIR
+		}
+	}
+	if err := parent.file.unlinkAt(ctx, name, flags); err != nil {
+		abortDelete()
+		return err
+	}
+	if fs.opts.interop != InteropModeShared {
+		// Keep the parent's cached state in line with the removal.
+		parent.touchCMtime(ctx)
+		parent.cacheNegativeChildLocked(name)
+		parent.dirents = nil
+	}
+	if child != nil {
+		child.setDeleted()
+		vfsObj.CommitDeleteDentry(childVFSD)
+		// Let the deferred unlock re-evaluate child's caching status.
+		ds = appendDentry(ds, child)
+	}
+	return nil
+}
+
+// renameMuRUnlockAndCheckCaching releases fs.renameMu, which must be locked
+// for reading. If *ds holds any dentries, it then re-acquires fs.renameMu for
+// writing, calls dentry.checkCachingLocked on each of them, and releases it
+// again. A non-nil *ds is returned to the slice pool afterwards.
+//
+// ds is a pointer-to-pointer because defer evaluates its arguments
+// immediately while dentry slices are allocated lazily; this lets callers
+// write "defer fs.renameMuRUnlockAndCheckCaching(&ds)" rather than wrapping
+// the call in a closure.
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	pending := *ds
+	if pending == nil {
+		return
+	}
+	if len(*pending) > 0 {
+		fs.renameMu.Lock()
+		for _, d := range *pending {
+			d.checkCachingLocked()
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(pending)
+}
+
+// renameMuUnlockAndCheckCaching is the counterpart of
+// renameMuRUnlockAndCheckCaching for callers that hold fs.renameMu for
+// writing: it calls dentry.checkCachingLocked on all dentries in *ds while the
+// lock is still held, unlocks fs.renameMu, and returns a non-nil *ds to the
+// slice pool.
+func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
+	pending := *ds
+	if pending != nil {
+		for _, d := range *pending {
+			d.checkCachingLocked()
+		}
+	}
+	fs.renameMu.Unlock()
+	if pending != nil {
+		putDentrySlice(pending)
+	}
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+//
+// The returned dentry carries a reference taken on behalf of the caller.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return nil, err
+	}
+	// With CheckSearchable, the result must be a directory on which the
+	// caller has exec (search) permission.
+	if opts.CheckSearchable && !target.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if opts.CheckSearchable {
+		if err := target.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+			return nil, err
+		}
+	}
+	target.IncRef()
+	return &target.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+//
+// The returned dentry carries a reference taken on behalf of the caller.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	origin := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// fs.walkParentDirLocked() requires up-to-date metadata for the
+		// dentry it starts from.
+		if err := origin.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	parentDir, err := fs.walkParentDirLocked(ctx, rp, origin, &pending)
+	if err != nil {
+		return nil, err
+	}
+	parentDir.IncRef()
+	return &parentDir.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+//
+// No link is ever created: once doCreateAt's checks pass, the result is EXDEV
+// if vd is on a different mount than rp, and EPERM otherwise.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	refuse := func(*dentry, string) error {
+		if rp.Mount() == vd.Mount() {
+			// 9P2000.L supports hard links, but we don't.
+			return syserror.EPERM
+		}
+		return syserror.EXDEV
+	}
+	return fs.doCreateAt(ctx, rp, false /* dir */, refuse)
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+//
+// The directory is created through the parent's 9P file, owned by the
+// effective KUID/KGID of rp's credentials.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	mkdir := func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		uid, gid := p9.UID(creds.EffectiveKUID), p9.GID(creds.EffectiveKGID)
+		_, err := parent.file.mkdir(ctx, name, p9.FileMode(opts.Mode), uid, gid)
+		return err
+	}
+	return fs.doCreateAt(ctx, rp, true /* dir */, mkdir)
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+//
+// The node is created through the parent's 9P file, owned by the effective
+// KUID/KGID of rp's credentials.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	mknod := func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		uid, gid := p9.UID(creds.EffectiveKUID), p9.GID(creds.EffectiveKGID)
+		_, err := parent.file.mknod(ctx, name, p9.FileMode(opts.Mode), opts.DevMajor, opts.DevMinor, uid, gid)
+		return err
+	}
+	return fs.doCreateAt(ctx, rp, false /* dir */, mknod)
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Reject O_TMPFILE, which is not supported; supporting it correctly in the
+	// presence of other remote filesystem users requires remote filesystem
+	// support, and it isn't clear that there's any way to implement this in
+	// 9P.
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		return nil, syserror.EOPNOTSUPP
+	}
+	mayCreate := opts.Flags&linux.O_CREAT != 0
+	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) // both O_CREAT and O_EXCL set
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		// Get updated metadata for start as required by fs.stepLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	if rp.Done() { // no path components remain: open the starting dentry itself
+		return start.openLocked(ctx, rp, opts.Flags)
+	}
+
+afterTrailingSymlink: // re-entered with start = parent after a trailing symlink is followed
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+	// Determine whether or not we need to create a file.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, &ds)
+	if err == syserror.ENOENT && mayCreate { // nothing exists at the final component and O_CREAT was given
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+		parent.dirMu.Unlock()
+		return fd, err
+	}
+	if err != nil {
+		parent.dirMu.Unlock()
+		return nil, err
+	}
+	// Open existing child or follow symlink.
+	parent.dirMu.Unlock()
+	if mustCreate { // O_CREAT|O_EXCL requires that the file did not already exist
+		return nil, syserror.EEXIST
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent // continue the walk from the directory containing the symlink
+		goto afterTrailingSymlink
+	}
+	return child.openLocked(ctx, rp, opts.Flags)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
+		return nil, err
+	}
+	mnt := rp.Mount()
+	filetype := d.fileType()
+	switch {
+	case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD:
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil {
+			return nil, err
+		}
+		fd := &regularFileFD{}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+			AllowDirectIO: true,
+		}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case filetype == linux.S_IFDIR:
+		// Can't open directories with O_CREAT.
+		if flags&linux.O_CREAT != 0 {
+			return nil, syserror.EISDIR
+		}
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		if flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+			return nil, err
+		}
+		fd := &directoryFD{}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case filetype == linux.S_IFLNK:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	default:
+		if flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0)
+		if err != nil {
+			return nil, err
+		}
+		fd := &specialFileFD{
+			handle: h,
+		}
+		if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			h.close(ctx)
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	}
+}
+
+// createAndOpenChildLocked creates and opens rp.Component() in directory d. Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		return nil, err
+	}
+	if d.isDeleted() {
+		return nil, syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return nil, err
+	}
+	defer mnt.EndWrite()
+
+	// 9P2000.L's lcreate takes a fid representing the parent directory, and
+	// converts it into an open fid representing the created file, so we need
+	// to duplicate the directory fid first.
+	_, dirfile, err := d.file.walk(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	creds := rp.Credentials()
+	name := rp.Component()
+	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+	if err != nil {
+		dirfile.close(ctx)
+		return nil, err
+	}
+	// Then we need to walk to the file we just created to get a non-open fid
+	// representing it, and to get its metadata. This must use d.file since, as
+	// explained above, dirfile was invalidated by dirfile.create().
+	walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+	if err != nil {
+		openFile.close(ctx)
+		if fdobj != nil { // fdobj is optional; it is only closed/released when create returned one
+			fdobj.Close()
+		}
+		return nil, err
+	}
+	// Sanity-check that we walked to the file we created.
+	if createQID.Path != walkQID.Path {
+		// Probably due to concurrent remote filesystem mutation?
+		ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop)
+		nonOpenFile.close(ctx)
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, syserror.EAGAIN
+	}
+
+	// Construct the new dentry.
+	child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
+	if err != nil {
+		nonOpenFile.close(ctx)
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, err
+	}
+	// Incorporate the fid that was opened by lcreate.
+	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
+	if useRegularFileFD {
+		child.handleMu.Lock()
+		child.handle.file = openFile // the opened fid becomes the dentry's shared handle
+		if fdobj != nil {
+			child.handle.fd = int32(fdobj.Release())
+		}
+		child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags)
+		child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
+		child.handleMu.Unlock()
+	}
+	// Take a reference on the new dentry to be held by the new file
+	// description. (This reference also means that the new dentry is not
+	// eligible for caching yet, so we don't need to append to a dentry slice.)
+	child.refs = 1
+	// Insert the dentry into the tree.
+	d.IncRef() // reference held by child on its parent d
+	d.vfsd.InsertChild(&child.vfsd, name)
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchCMtime(ctx)
+		delete(d.negativeChildren, name)
+		d.dirents = nil
+	}
+
+	// Finally, construct a file description representing the created file.
+	var childVFSFD *vfs.FileDescription
+	mnt.IncRef() // NOTE(review): openLocked passes mnt to Init without an extra IncRef; confirm which is intended
+	if useRegularFileFD {
+		fd := &regularFileFD{}
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
+			AllowDirectIO: true,
+		}); err != nil {
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	} else {
+		fd := &specialFileFD{
+			handle: handle{
+				file: openFile,
+				fd:   -1, // replaced below if create returned a host FD
+			},
+		}
+		if fdobj != nil {
+			fd.handle.fd = int32(fdobj.Release())
+		}
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			fd.handle.close(ctx)
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	}
+	return childVFSFD, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+//
+// EINVAL is returned if rp does not resolve to a symlink.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	link, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return "", err
+	}
+	if link.isSymlink() {
+		return link.readlink(ctx, rp.Mount())
+	}
+	return "", syserror.EINVAL
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// Rename flags are rejected; supporting them requires 9P support.
+		return syserror.EINVAL
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.Lock() // write lock: dirMu may be held on two dentries at once below
+	defer fs.renameMuUnlockAndCheckCaching(&ds)
+	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) // NOTE(review): start's metadata is not refreshed under InteropModeShared, unlike other callers in this file; confirm
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() { // source and destination must be on the same mount
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if fs.opts.interop == InteropModeShared {
+		if err := oldParent.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return err
+	}
+	vfsObj := rp.VirtualFilesystem()
+	// We need a dentry representing the renamed file since, if it's a
+	// directory, we need to check for write permission on it.
+	oldParent.dirMu.Lock()
+	defer oldParent.dirMu.Unlock()
+	renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds)
+	if err != nil {
+		return err
+	}
+	if renamed == nil {
+		return syserror.ENOENT
+	}
+	if renamed.isDir() {
+		if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { // a directory cannot be moved into itself or its own subtree
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if oldParent != newParent {
+		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+			return err
+		}
+		newParent.dirMu.Lock() // only when distinct: oldParent.dirMu is already held above
+		defer newParent.dirMu.Unlock()
+	}
+	if newParent.isDeleted() {
+		return syserror.ENOENT
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	// This is similar to unlinkAt, except:
+	//
+	// - We revalidate the replaced dentry unconditionally for simplicity.
+	//
+	// - If rp.MustBeDir(), then we need a dentry representing the replaced
+	// file regardless to confirm that it's a directory.
+	if replacedVFSD != nil || rp.MustBeDir() {
+		replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds)
+		if err != nil {
+			return err
+		}
+		if replaced != nil {
+			if replaced.isDir() {
+				if !renamed.isDir() {
+					return syserror.EISDIR
+				}
+			} else {
+				if rp.MustBeDir() || renamed.isDir() {
+					return syserror.ENOTDIR
+				}
+			}
+			replacedVFSD = &replaced.vfsd
+		} else {
+			replacedVFSD = nil
+		}
+	}
+
+	if oldParent == newParent && oldName == newName { // same parent and name: nothing to do
+		return nil
+	}
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil {
+		return err
+	}
+	if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) // undo the prepare step when the remote rename fails
+		return err
+	}
+	if fs.opts.interop != InteropModeShared {
+		oldParent.cacheNegativeChildLocked(oldName)
+		oldParent.dirents = nil
+		delete(newParent.negativeChildren, newName)
+		newParent.dirents = nil
+	}
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, true /* dir */)
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+//
+// rp is resolved to a dentry, on which opts.Stat is applied via setStat.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return err
+	}
+	creds := rp.Credentials()
+	mnt := rp.Mount()
+	return target.setStat(ctx, creds, &opts.Stat, mnt)
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	// Cached metadata can be returned regardless of fs.opts.interop: under
+	// InteropModeShared, walking updates the metadata of every traversed
+	// dentry, including the one returned.
+	result := linux.Statx{}
+	target.statTo(&result)
+	return result, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+//
+// The statistics come from the resolved file's statFS call. The reported name
+// length is capped at maxFilenameLen.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	remote, err := target.file.statFS(ctx)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	maxName := uint64(remote.NameLength)
+	if maxName > maxFilenameLen {
+		maxName = maxFilenameLen
+	}
+	var out linux.Statfs
+	// The type is primarily for distinguishing a gofer file system in tests.
+	// Testing is important, so instead of defining something completely
+	// random, a standard value is used.
+	out.Type = linux.V9FS_MAGIC
+	out.BlockSize = int64(remote.BlockSize)
+	out.Blocks = remote.Blocks
+	out.BlocksFree = remote.BlocksFree
+	out.BlocksAvailable = remote.BlocksAvailable
+	out.Files = remote.Files
+	out.FilesFree = remote.FilesFree
+	out.NameLength = maxName
+	return out, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+//
+// The symlink is created through the parent's 9P file, owned by the effective
+// KUID/KGID of rp's credentials.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	symlink := func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		uid, gid := p9.UID(creds.EffectiveKUID), p9.GID(creds.EffectiveKGID)
+		_, err := parent.file.symlink(ctx, target, name, uid, gid)
+		return err
+	}
+	return fs.doCreateAt(ctx, rp, false /* dir */, symlink)
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, false /* dir */)
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+//
+// rp is resolved to a dentry, whose listxattr result is returned.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return nil, err
+	}
+	return target.listxattr(ctx)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+//
+// rp is resolved to a dentry, whose getxattr result for name is returned.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return "", err
+	}
+	return target.getxattr(ctx, name)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+//
+// rp is resolved to a dentry, on which opts is applied via setxattr.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return err
+	}
+	return target.setxattr(ctx, &opts)
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+//
+// rp is resolved to a dentry, on which removexattr is called for name.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	var pending *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&pending)
+	target, err := fs.resolveLocked(ctx, rp, &pending)
+	if err != nil {
+		return err
+	}
+	return target.removexattr(ctx, name)
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+//
+// fs.renameMu is held for reading while vfs.GenericPrependPath runs.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	err := vfs.GenericPrependPath(vfsroot, vd, b)
+	return err
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
new file mode 100644
index 000000000..d0552bd99
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -0,0 +1,1147 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gofer provides a filesystem implementation that is backed by a 9p
+// server, interchangeably referred to as "gofers" throughout this package.
+//
+// Lock order:
+//   regularFileFD/directoryFD.mu
+//     filesystem.renameMu
+//       dentry.dirMu
+//         filesystem.syncMu
+//         dentry.metadataMu
+//           *** "memmap.Mappable locks" below this point
+//           dentry.mapsMu
+//             *** "memmap.Mappable locks taken by Translate" below this point
+//             dentry.handleMu
+//               dentry.dataMu
+//
+// Locking dentry.dirMu in multiple dentries requires holding
+// filesystem.renameMu for writing.
+package gofer
+
+import (
+	"fmt"
+	"strconv"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// It carries no state of its own; its GetFilesystem method builds a
+// gofer-backed filesystem from the supplied mount options.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+//
+// A filesystem is constructed by FilesystemType.GetFilesystem and torn down
+// by Release.
+type filesystem struct {
+	// vfsfs is the vfs.Filesystem paired with this implementation. It is
+	// initialized by GetFilesystem.
+	vfsfs vfs.Filesystem
+
+	// mfp is used to allocate memory that caches regular file contents. mfp is
+	// immutable.
+	mfp pgalloc.MemoryFileProvider
+
+	// Immutable options.
+	opts filesystemOptions
+
+	// client is the client used by this filesystem. client is immutable.
+	client *p9.Client
+
+	// uid and gid are the effective KUID and KGID of the filesystem's creator,
+	// and are used as the owner and group for files that don't specify one.
+	// uid and gid are immutable.
+	uid auth.KUID
+	gid auth.KGID
+
+	// renameMu serves two purposes:
+	//
+	// - It synchronizes path resolution with renaming initiated by this
+	// client.
+	//
+	// - It is held by path resolution to ensure that reachable dentries remain
+	// valid. A dentry is reachable by path resolution if it has a non-zero
+	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
+	// is reachable from its children), or if it is a child dentry (such that
+	// it is reachable from its parent).
+	renameMu sync.RWMutex
+
+	// cachedDentries contains all dentries with 0 references. (Due to race
+	// conditions, it may also contain dentries with non-zero references.)
+	// cachedDentriesLen is the number of dentries in cachedDentries. These
+	// fields are protected by renameMu.
+	cachedDentries    dentryList
+	cachedDentriesLen uint64
+
+	// dentries contains all dentries in this filesystem. specialFileFDs
+	// contains all open specialFileFDs. These fields are protected by syncMu.
+	syncMu         sync.Mutex
+	dentries       map[*dentry]struct{}
+	specialFileFDs map[*specialFileFD]struct{}
+}
+
+// filesystemOptions holds the mount options parsed by
+// FilesystemType.GetFilesystem. It is stored in filesystem.opts.
+type filesystemOptions struct {
+	// "Standard" 9P options.
+	fd      int
+	aname   string
+	interop InteropMode // derived from the "cache" mount option
+	msize   uint32
+	version string
+
+	// maxCachedDentries is the maximum number of dentries with 0 references
+	// retained by the client.
+	maxCachedDentries uint64
+
+	// If forcePageCache is true, host FDs may not be used for application
+	// memory mappings even if available; instead, the client must perform its
+	// own caching of regular file pages. This is primarily useful for testing.
+	forcePageCache bool
+
+	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
+	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
+	// makes memory accounting behavior more consistent between cases where
+	// host FDs are / are not available, but may increase the frequency of
+	// sentry-handled page faults on files for which a host FD is available.
+	limitHostFDTranslation bool
+
+	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
+	// filesystem may not be coherent with writable host FDs opened later, so
+	// mappings of the former must be replaced by mappings of the latter. This
+	// is usually only the case when the remote filesystem is an overlayfs
+	// mount on Linux < 4.19.
+	overlayfsStaleRead bool
+
+	// If regularFilesUseSpecialFileFD is true, application FDs representing
+	// regular files will use distinct file handles for each FD, in the same
+	// way that application FDs representing "special files" such as sockets
+	// do. Note that this disables client caching and mmap for regular files.
+	regularFilesUseSpecialFileFD bool
+}
+
+// InteropMode controls the client's interaction with other remote filesystem
+// users.
+//
+// FilesystemType.GetFilesystem derives the mode from the "cache" mount
+// option: "fscache" (and an absent option) selects InteropModeExclusive,
+// "fscache_writethrough" selects InteropModeWritethrough, and both "none" and
+// "remote_revalidating" select InteropModeShared.
+type InteropMode uint32
+
+const (
+	// InteropModeExclusive is appropriate when the filesystem client is the
+	// only user of the remote filesystem.
+	//
+	// - The client may cache arbitrary filesystem state (file data, metadata,
+	// filesystem structure, etc.).
+	//
+	// - Client changes to filesystem state may be sent to the remote
+	// filesystem asynchronously, except when server permission checks are
+	// necessary.
+	//
+	// - File timestamps are based on client clocks. This ensures that users of
+	// the client observe timestamps that are coherent with their own clocks
+	// and consistent with Linux's semantics. However, since it is not always
+	// possible for clients to set arbitrary atimes and mtimes, and never
+	// possible for clients to set arbitrary ctimes, file timestamp changes are
+	// stored in the client only and never sent to the remote filesystem.
+	InteropModeExclusive InteropMode = iota
+
+	// InteropModeWritethrough is appropriate when there are read-only users of
+	// the remote filesystem that expect to observe changes made by the
+	// filesystem client.
+	//
+	// - The client may cache arbitrary filesystem state.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on client clocks. As a corollary, access
+	// timestamp changes from other remote filesystem users will not be visible
+	// to the client.
+	InteropModeWritethrough
+
+	// InteropModeShared is appropriate when there are users of the remote
+	// filesystem that may mutate its state other than the client.
+	//
+	// - The client must verify cached filesystem state before using it.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on server clocks. This is necessary to
+	// ensure that timestamp changes are synchronized between remote filesystem
+	// users.
+	//
+	// Note that the correctness of InteropModeShared depends on the server
+	// correctly implementing 9P fids (i.e. each fid immutably represents a
+	// single filesystem object), even in the presence of remote filesystem
+	// mutations from other users. If this is violated, the behavior of the
+	// client is undefined.
+	InteropModeShared
+)
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+//
+// opts.Data is parsed with vfs.GenericParseMountOptions. Recognized options:
+//
+//   - trans: required, must be "fd".
+//   - rfdno, wfdno: required, must parse as integers and be equal; the value
+//     is the FD used to reach the server.
+//   - aname: attach name; defaults to "/".
+//   - cache: one of "fscache", "fscache_writethrough", "none" or
+//     "remote_revalidating"; defaults to InteropModeExclusive.
+//   - msize: 9P message size; defaults to 1 MiB.
+//   - version: 9P protocol version; defaults to p9.HighestVersionString().
+//   - dentry_cache_limit: maximum number of cached dentries; defaults to 1000.
+//   - force_page_cache, limit_host_fd_translation, overlayfs_stale_read:
+//     boolean flags enabled by their presence.
+//
+// Any missing required option, malformed value, or unrecognized option
+// yields syserror.EINVAL. Errors from connecting to, negotiating with, or
+// attaching to the server are returned as-is, after releasing whatever was
+// acquired up to that point.
+//
+// On success, the returned root dentry carries the reference that is handed
+// to the caller.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+	if mfp == nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Each option is deleted from mopts once consumed, so that whatever is
+	// left at the end can be reported as unknown.
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	var fsopts filesystemOptions
+
+	// Check that the transport is "fd".
+	trans, ok := mopts["trans"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "trans")
+	if trans != "fd" {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Check that read and write FDs are provided and identical.
+	rfdstr, ok := mopts["rfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>'")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "rfdno")
+	rfd, err := strconv.Atoi(rfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	wfdstr, ok := mopts["wfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>'")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "wfdno")
+	wfd, err := strconv.Atoi(wfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	if rfd != wfd {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+		return nil, nil, syserror.EINVAL
+	}
+	fsopts.fd = rfd
+
+	// Get the attach name.
+	fsopts.aname = "/"
+	if aname, ok := mopts["aname"]; ok {
+		delete(mopts, "aname")
+		fsopts.aname = aname
+	}
+
+	// Parse the cache policy. For historical reasons, this defaults to the
+	// least generally-applicable option, InteropModeExclusive.
+	fsopts.interop = InteropModeExclusive
+	if cache, ok := mopts["cache"]; ok {
+		delete(mopts, "cache")
+		switch cache {
+		case "fscache":
+			fsopts.interop = InteropModeExclusive
+		case "fscache_writethrough":
+			fsopts.interop = InteropModeWritethrough
+		case "none":
+			// "none" is InteropModeShared plus per-FD handles for regular
+			// files, hence the fallthrough.
+			fsopts.regularFilesUseSpecialFileFD = true
+			fallthrough
+		case "remote_revalidating":
+			fsopts.interop = InteropModeShared
+		default:
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
+	// Parse the 9P message size.
+	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
+	if msizestr, ok := mopts["msize"]; ok {
+		delete(mopts, "msize")
+		msize, err := strconv.ParseUint(msizestr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.msize = uint32(msize)
+	}
+
+	// Parse the 9P protocol version.
+	fsopts.version = p9.HighestVersionString()
+	if version, ok := mopts["version"]; ok {
+		delete(mopts, "version")
+		fsopts.version = version
+	}
+
+	// Parse the dentry cache limit.
+	fsopts.maxCachedDentries = 1000
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err := strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.maxCachedDentries = maxCachedDentries
+	}
+
+	// Handle simple flags.
+	if _, ok := mopts["force_page_cache"]; ok {
+		delete(mopts, "force_page_cache")
+		fsopts.forcePageCache = true
+	}
+	if _, ok := mopts["limit_host_fd_translation"]; ok {
+		delete(mopts, "limit_host_fd_translation")
+		fsopts.limitHostFDTranslation = true
+	}
+	if _, ok := mopts["overlayfs_stale_read"]; ok {
+		delete(mopts, "overlayfs_stale_read")
+		fsopts.overlayfsStaleRead = true
+	}
+	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
+	// "cache=none".
+
+	// Check for unparsed options.
+	if len(mopts) != 0 {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Establish a connection with the server.
+	conn, err := unet.NewSocket(fsopts.fd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Perform version negotiation with the server.
+	ctx.UninterruptibleSleepStart(false)
+	client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		conn.Close()
+		return nil, nil, err
+	}
+	// Ownership of conn has been transferred to client.
+
+	// Perform attach to obtain the filesystem root.
+	ctx.UninterruptibleSleepStart(false)
+	attached, err := client.Attach(fsopts.aname)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		client.Close()
+		return nil, nil, err
+	}
+	attachFile := p9file{attached}
+	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+	if err != nil {
+		attachFile.close(ctx)
+		client.Close()
+		return nil, nil, err
+	}
+
+	// Construct the filesystem object.
+	fs := &filesystem{
+		mfp:            mfp,
+		opts:           fsopts,
+		uid:            creds.EffectiveKUID,
+		gid:            creds.EffectiveKGID,
+		client:         client,
+		dentries:       make(map[*dentry]struct{}),
+		specialFileFDs: make(map[*specialFileFD]struct{}),
+	}
+	fs.vfsfs.Init(vfsObj, fs)
+
+	// Construct the root dentry.
+	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
+	if err != nil {
+		attachFile.close(ctx)
+		fs.vfsfs.DecRef()
+		return nil, nil, err
+	}
+	// Set the root's reference count to 2. One reference is returned to the
+	// caller, and the other is deliberately leaked to prevent the root from
+	// being "cached" and subsequently evicted. Its resources will still be
+	// cleaned up by fs.Release().
+	root.refs = 2
+
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+//
+// For every dentry tracked in fs.dentries, Release writes dirty cached data
+// back to the remote file (when the dentry's handle is writable), discards
+// the dentry's cached pages, and closes its host FD if it has one. It then
+// closes the 9P client connection.
+func (fs *filesystem) Release() {
+	ctx := context.Background()
+	// The memory file is looked up once and shared by every dentry below.
+	mf := fs.mfp.MemoryFile()
+
+	fs.syncMu.Lock()
+	for d := range fs.dentries {
+		d.handleMu.Lock()
+		d.dataMu.Lock()
+		if d.handleWritable {
+			// Write dirty cached data to the remote file. A failure is only
+			// logged: Release has no error result, and the remaining
+			// dentries still have to be cleaned up.
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
+			}
+			// TODO(jamieliu): Do we need to flushf/fsync d?
+		}
+		// Discard cached pages.
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+		d.dataMu.Unlock()
+		// Close the host fd if one exists.
+		if d.handle.fd >= 0 {
+			syscall.Close(int(d.handle.fd))
+			d.handle.fd = -1
+		}
+		d.handleMu.Unlock()
+	}
+	// There can't be any specialFileFDs still using fs, since each such
+	// FileDescription would hold a reference on a Mount holding a reference on
+	// fs.
+	fs.syncMu.Unlock()
+
+	// Close the connection to the server. This implicitly clunks all fids.
+	fs.client.Close()
+}
+
+// dentry implements vfs.DentryImpl.
+//
+// dentries are created by filesystem.newDentry, which also registers them in
+// filesystem.dentries.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// refs is the reference count. Each dentry holds a reference on its
+	// parent, even if disowned. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// We don't support hard links, so each dentry maps 1:1 to an inode.
+
+	// file is the unopened p9.File that backs this dentry. file is immutable.
+	file p9file
+
+	// If deleted is non-zero, the file represented by this dentry has been
+	// deleted. deleted is accessed using atomic memory operations.
+	deleted uint32
+
+	// If cached is true, dentryEntry links dentry into
+	// filesystem.cachedDentries. cached and dentryEntry are protected by
+	// filesystem.renameMu.
+	cached bool
+	dentryEntry
+
+	// dirMu protects negativeChildren and dirents.
+	dirMu sync.Mutex
+
+	// If this dentry represents a directory, and InteropModeShared is not in
+	// effect, negativeChildren is a set of child names in this directory that
+	// are known not to exist. negativeChildren is protected by dirMu.
+	negativeChildren map[string]struct{}
+
+	// If this dentry represents a directory, InteropModeShared is not in
+	// effect, and dirents is not nil, it is a cache of all entries in the
+	// directory, in the order they were returned by the server. dirents is
+	// protected by dirMu.
+	dirents []vfs.Dirent
+
+	// Cached metadata; protected by metadataMu and accessed using atomic
+	// memory operations unless otherwise specified.
+	metadataMu sync.Mutex
+	ino        uint64 // immutable
+	mode       uint32 // type is immutable, perms are mutable
+	uid        uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32 // auth.KGID, but ...
+	blockSize  uint32 // 0 if unknown
+	// Timestamps, all nsecs from the Unix epoch.
+	atime int64
+	mtime int64
+	ctime int64
+	btime int64
+	// File size, protected by both metadataMu and dataMu (i.e. both must be
+	// locked to mutate it).
+	size uint64
+
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex
+
+	// If this dentry represents a regular file, mappings tracks mappings of
+	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// If this dentry represents a regular file or directory:
+	//
+	// - handle is the I/O handle used by all regularFileFDs/directoryFDs
+	// representing this dentry.
+	//
+	// - handleReadable is true if handle is readable.
+	//
+	// - handleWritable is true if handle is writable.
+	//
+	// Invariants:
+	//
+	// - If handleReadable == handleWritable == false, then handle.file == nil
+	// (i.e. there is no open handle). Conversely, if handleReadable ||
+	// handleWritable == true, then handle.file != nil (i.e. there is an open
+	// handle).
+	//
+	// - handleReadable and handleWritable cannot transition from true to false
+	// (i.e. handles may not be downgraded).
+	//
+	// These fields are protected by handleMu.
+	handleMu       sync.RWMutex
+	handle         handle
+	handleReadable bool
+	handleWritable bool
+
+	// dataMu protects cache, dirty, haveTarget and target, and (together
+	// with metadataMu) size.
+	dataMu sync.RWMutex
+
+	// If this dentry represents a regular file that is client-cached, cache
+	// maps offsets into the cached file to offsets into
+	// filesystem.mfp.MemoryFile() that store the file's data. cache is
+	// protected by dataMu.
+	cache fsutil.FileRangeSet
+
+	// If this dentry represents a regular file that is client-cached, dirty
+	// tracks dirty segments in cache. dirty is protected by dataMu.
+	dirty fsutil.DirtySet
+
+	// pf implements platform.File for mappings of handle.fd.
+	pf dentryPlatformFile
+
+	// If this dentry represents a symbolic link, InteropModeShared is not in
+	// effect, and haveTarget is true, target is the symlink target. haveTarget
+	// and target are protected by dataMu.
+	haveTarget bool
+	target     string
+}
+
+// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
+// gofer client.
+func dentryAttrMask() p9.AttrMask {
+	return p9.AttrMask{
+		Mode:  true,
+		UID:   true,
+		GID:   true,
+		ATime: true,
+		MTime: true,
+		CTime: true,
+		Size:  true,
+		BTime: true,
+	}
+}
+
+// newDentry builds a dentry for the given file and registers it in
+// fs.dentries. The returned dentry has a zero reference count and is not in
+// the dentry cache; the caller must set its reference count and/or call
+// dentry.checkCachingLocked() as appropriate.
+//
+// newDentry fails with syserror.EIO if mask does not include the file mode,
+// or if the file is a regular file and mask does not include its size.
+// Attributes absent from mask fall back to defaults: the filesystem's uid and
+// gid for ownership, and zero for size and timestamps.
+func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
+	if !mask.Mode {
+		ctx.Warningf("can't create gofer.dentry without file type")
+		return nil, syserror.EIO
+	}
+	if isRegular := attr.Mode.FileType() == p9.ModeRegular; isRegular && !mask.Size {
+		ctx.Warningf("can't create regular file gofer.dentry without file size")
+		return nil, syserror.EIO
+	}
+
+	// Work out ownership and block size first, preferring server-provided
+	// values over the filesystem-wide defaults.
+	uid, gid := uint32(fs.uid), uint32(fs.gid)
+	if mask.UID {
+		uid = uint32(attr.UID)
+	}
+	if mask.GID {
+		gid = uint32(attr.GID)
+	}
+	blockSize := uint32(usermem.PageSize)
+	if attr.BlockSize != 0 {
+		blockSize = uint32(attr.BlockSize)
+	}
+
+	d := &dentry{
+		fs:        fs,
+		file:      file,
+		ino:       qid.Path,
+		mode:      uint32(attr.Mode),
+		uid:       uid,
+		gid:       gid,
+		blockSize: blockSize,
+		// A negative fd marks the absence of a host FD.
+		handle: handle{fd: -1},
+	}
+	d.pf.dentry = d
+
+	if mask.Size {
+		d.size = attr.Size
+	}
+	if mask.ATime {
+		d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
+	}
+	if mask.MTime {
+		d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
+	}
+	if mask.CTime {
+		d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
+	}
+	if mask.BTime {
+		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
+	}
+	d.vfsd.Init(d)
+
+	// Make the new dentry visible in the set of all dentries.
+	fs.syncMu.Lock()
+	fs.dentries[d] = struct{}{}
+	fs.syncMu.Unlock()
+	return d, nil
+}
+
+// updateFromP9Attrs refreshes d's cached metadata from attributes reported by
+// the remote filesystem. Only attributes enabled in mask are applied, except
+// for the block size, which is applied whenever it is non-zero.
+//
+// It panics if the reported file type differs from d's current file type.
+func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
+	d.metadataMu.Lock()
+	if mask.Mode {
+		newType := uint32(attr.Mode.FileType())
+		if oldType := d.fileType(); newType != oldType {
+			// Drop the lock before panicking.
+			d.metadataMu.Unlock()
+			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", oldType, newType))
+		}
+		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
+	}
+	if mask.UID {
+		atomic.StoreUint32(&d.uid, uint32(attr.UID))
+	}
+	if mask.GID {
+		atomic.StoreUint32(&d.gid, uint32(attr.GID))
+	}
+	// There is no P9_GETATTR_* bit for I/O block size.
+	if bs := attr.BlockSize; bs != 0 {
+		atomic.StoreUint32(&d.blockSize, uint32(bs))
+	}
+	if mask.ATime {
+		ts := dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
+		atomic.StoreInt64(&d.atime, ts)
+	}
+	if mask.MTime {
+		ts := dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
+		atomic.StoreInt64(&d.mtime, ts)
+	}
+	if mask.CTime {
+		ts := dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
+		atomic.StoreInt64(&d.ctime, ts)
+	}
+	if mask.BTime {
+		ts := dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
+		atomic.StoreInt64(&d.btime, ts)
+	}
+	if mask.Size {
+		// The size is guarded by dataMu in addition to metadataMu.
+		d.dataMu.Lock()
+		atomic.StoreUint64(&d.size, attr.Size)
+		d.dataMu.Unlock()
+	}
+	d.metadataMu.Unlock()
+}
+
+func (d *dentry) updateFromGetattr(ctx context.Context) error {
+	// Use d.handle.file, which represents a 9P fid that has been opened, in
+	// preference to d.file, which represents a 9P fid that has not. This may
+	// be significantly more efficient in some implementations.
+	var (
+		file            p9file
+		handleMuRLocked bool
+	)
+	d.handleMu.RLock()
+	if !d.handle.file.isNil() {
+		file = d.handle.file
+		handleMuRLocked = true
+	} else {
+		file = d.file
+		d.handleMu.RUnlock()
+	}
+	_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
+	if handleMuRLocked {
+		d.handleMu.RUnlock()
+	}
+	if err != nil {
+		return err
+	}
+	d.updateFromP9Attrs(attrMask, &attr)
+	return nil
+}
+
+// fileType returns the file type bits (linux.S_IFMT) of d's cached mode.
+func (d *dentry) fileType() uint32 {
+	mode := atomic.LoadUint32(&d.mode)
+	return mode & linux.S_IFMT
+}
+
+// statTo fills stat from d's cached metadata. The link count is reported as
+// 2 for directories and 1 for everything else, and the block count is
+// derived from the file size.
+func (d *dentry) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE |
+		linux.STATX_MODE |
+		linux.STATX_NLINK |
+		linux.STATX_UID |
+		linux.STATX_GID |
+		linux.STATX_ATIME |
+		linux.STATX_MTIME |
+		linux.STATX_CTIME |
+		linux.STATX_INO |
+		linux.STATX_SIZE |
+		linux.STATX_BLOCKS |
+		linux.STATX_BTIME
+	stat.Blksize = atomic.LoadUint32(&d.blockSize)
+	if d.isDir() {
+		stat.Nlink = 2
+	} else {
+		stat.Nlink = 1
+	}
+	stat.UID = atomic.LoadUint32(&d.uid)
+	stat.GID = atomic.LoadUint32(&d.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+	stat.Ino = d.ino
+
+	size := atomic.LoadUint64(&d.size)
+	stat.Size = size
+	// Round up to whole 512-byte blocks. This is consistent with
+	// regularFileFD.Seek(), which treats regular files as having no holes.
+	stat.Blocks = (size + 511) / 512
+
+	stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
+	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
+	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
+	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+	// TODO(jamieliu): device number
+}
+
+// setStat applies the attribute changes selected by stat.Mask to d.
+//
+// Only mode, uid, gid, atime, mtime and size may be changed; any other mask
+// bit yields syserror.EPERM. Errors from vfs.CheckSetStat,
+// mnt.CheckBeginWrite and the remote setAttr request are returned unchanged.
+//
+// Unless InteropModeShared is in effect, timestamp changes are applied to the
+// cached metadata only and are not sent to the remote filesystem, a size
+// change also sets mtime to the current time if mtime was not requested, and
+// the cached metadata (and, on truncation, mappings and cached pages) is
+// updated after the remote request succeeds.
+//
+// Note that stat is modified in that case: the ATIME/MTIME bits are cleared
+// from stat.Mask, and stat.Mtime.Nsec may be set to linux.UTIME_NOW.
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
+	if stat.Mask == 0 {
+		return nil
+	}
+	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	setLocalAtime := false
+	setLocalMtime := false
+	if d.fs.opts.interop != InteropModeShared {
+		// Timestamp updates will be handled locally.
+		setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
+		setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
+		stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME
+		if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) {
+			// Truncate updates mtime.
+			setLocalMtime = true
+			stat.Mtime.Nsec = linux.UTIME_NOW
+		}
+	}
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	// stat.Mask may have become empty if only timestamps were requested and
+	// they are being handled locally; in that case nothing is sent.
+	if stat.Mask != 0 {
+		if err := d.file.setAttr(ctx, p9.SetAttrMask{
+			Permissions:        stat.Mask&linux.STATX_MODE != 0,
+			UID:                stat.Mask&linux.STATX_UID != 0,
+			GID:                stat.Mask&linux.STATX_GID != 0,
+			Size:               stat.Mask&linux.STATX_SIZE != 0,
+			ATime:              stat.Mask&linux.STATX_ATIME != 0,
+			MTime:              stat.Mask&linux.STATX_MTIME != 0,
+			ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
+			MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
+		}, p9.SetAttr{
+			Permissions:      p9.FileMode(stat.Mode),
+			UID:              p9.UID(stat.UID),
+			GID:              p9.GID(stat.GID),
+			Size:             stat.Size,
+			ATimeSeconds:     uint64(stat.Atime.Sec),
+			ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+			MTimeSeconds:     uint64(stat.Mtime.Sec),
+			MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+		}); err != nil {
+			return err
+		}
+	}
+	if d.fs.opts.interop == InteropModeShared {
+		// There's no point to updating d's metadata in this case since it'll
+		// be overwritten by revalidation before the next time it's used
+		// anyway. (InteropModeShared inhibits client caching of regular file
+		// data, so there's no cache to truncate either.)
+		return nil
+	}
+	now, haveNow := nowFromContext(ctx)
+	if !haveNow {
+		ctx.Warningf("gofer.dentry.setStat: current time not available")
+	}
+	if stat.Mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
+	}
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&d.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&d.gid, stat.GID)
+	}
+	if setLocalAtime {
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			if haveNow {
+				atomic.StoreInt64(&d.atime, now)
+			}
+		} else {
+			atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+		}
+	}
+	if setLocalMtime {
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			if haveNow {
+				atomic.StoreInt64(&d.mtime, now)
+			}
+		} else {
+			atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+		}
+	}
+	if haveNow {
+		atomic.StoreInt64(&d.ctime, now)
+	}
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		newSize := stat.Size
+		d.dataMu.Lock()
+		// Both locks required to mutate d.size are held here, so a plain
+		// read is fine; the write must be atomic because d.size is also read
+		// with atomic.LoadUint64 by callers that hold neither lock.
+		oldSize := d.size
+		atomic.StoreUint64(&d.size, newSize)
+		// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+		// below. This allows concurrent calls to Read/Translate/etc. These
+		// functions synchronize with truncation by refusing to use cache
+		// contents beyond the new d.size. (We are still holding d.metadataMu,
+		// so we can't race with Write or another truncate.)
+		d.dataMu.Unlock()
+		if newSize < oldSize {
+			oldpgend := pageRoundUp(oldSize)
+			newpgend := pageRoundUp(newSize)
+			if oldpgend != newpgend {
+				d.mapsMu.Lock()
+				d.mappings.Invalidate(memmap.MappableRange{Start: newpgend, End: oldpgend}, memmap.InvalidateOpts{
+					// Compare Linux's mm/truncate.c:truncate_setsize() =>
+					// truncate_pagecache() =>
+					// mm/memory.c:unmap_mapping_range(evencows=1).
+					InvalidatePrivate: true,
+				})
+				d.mapsMu.Unlock()
+			}
+			// We are now guaranteed that there are no translations of
+			// truncated pages, and can remove them from the cache. Since
+			// truncated pages have been removed from the remote file, they
+			// should be dropped without being written back.
+			d.dataMu.Lock()
+			d.cache.Truncate(newSize, d.fs.mfp.MemoryFile())
+			d.dirty.KeepClean(memmap.MappableRange{Start: newSize, End: oldpgend})
+			d.dataMu.Unlock()
+		}
+	}
+	return nil
+}
+
+// checkPermissions checks the requested access against d's cached permission
+// bits, owner and group using vfs.GenericCheckPermissions.
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	perms := uint16(atomic.LoadUint32(&d.mode)) & 0777
+	owner := auth.KUID(atomic.LoadUint32(&d.uid))
+	group := auth.KGID(atomic.LoadUint32(&d.gid))
+	return vfs.GenericCheckPermissions(creds, ats, isDir, perms, owner, group)
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+	// d.checkCachingLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+//
+// It takes a reference only if the reference count is currently non-zero,
+// retrying the compare-and-swap until it either succeeds or observes a count
+// of zero.
+func (d *dentry) TryIncRef() bool {
+	for {
+		cur := atomic.LoadInt64(&d.refs)
+		switch {
+		case cur == 0:
+			return false
+		case atomic.CompareAndSwapInt64(&d.refs, cur, cur+1):
+			return true
+		}
+		// The count changed between the load and the swap; try again.
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+//
+// When the last reference is dropped, the dentry's caching state is
+// re-evaluated with d.fs.renameMu locked for writing. Dropping a reference
+// that was never held panics.
+func (d *dentry) DecRef() {
+	remaining := atomic.AddInt64(&d.refs, -1)
+	switch {
+	case remaining < 0:
+		panic("gofer.dentry.DecRef() called without holding a reference")
+	case remaining == 0:
+		d.fs.renameMu.Lock()
+		d.checkCachingLocked()
+		d.fs.renameMu.Unlock()
+	}
+}
+
+// checkCachingLocked should be called after d's reference count becomes 0 or it
+// becomes disowned.
+//
+// Depending on d's state it does one of the following:
+//
+// - If d has references, it only ensures d is not in the dentry cache.
+//
+// - If d has no parent or is disowned, it removes d from the cache (if
+// present) and destroys it.
+//
+// - Otherwise it inserts d at (or moves d to) the front of the cache and, if
+// the cache now exceeds fs.opts.maxCachedDentries, evicts the dentry at the
+// back.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkCachingLocked() {
+	// Dentries with a non-zero reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires renameMu, so if d.refs is zero then it will
+	// remain zero while we hold renameMu for writing.)
+	if atomic.LoadInt64(&d.refs) != 0 {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		return
+	}
+	// Non-child dentries with zero references are no longer reachable by path
+	// resolution and should be dropped immediately.
+	if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		d.destroyLocked()
+		return
+	}
+	// If d is already cached, just move it to the front of the LRU.
+	if d.cached {
+		d.fs.cachedDentries.Remove(d)
+		d.fs.cachedDentries.PushFront(d)
+		return
+	}
+	// Cache the dentry, then evict the least recently used cached dentry if
+	// the cache becomes over-full.
+	d.fs.cachedDentries.PushFront(d)
+	d.fs.cachedDentriesLen++
+	d.cached = true
+	if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
+		// The back of the list is the least recently used entry.
+		victim := d.fs.cachedDentries.Back()
+		d.fs.cachedDentries.Remove(victim)
+		d.fs.cachedDentriesLen--
+		victim.cached = false
+		// victim.refs may have become non-zero from an earlier path
+		// resolution since it was inserted into fs.cachedDentries; see
+		// dentry.incRefLocked(). Either way, we brought
+		// fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so
+		// we don't loop.
+		if atomic.LoadInt64(&victim.refs) == 0 {
+			if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil {
+				victimParent := victimParentVFSD.Impl().(*dentry)
+				// The parent's dirMu is held while victim is detached from
+				// the dentry tree.
+				victimParent.dirMu.Lock()
+				if !victim.vfsd.IsDisowned() {
+					// victim can't be a mount point (in any mount
+					// namespace), since VFS holds references on mount
+					// points.
+					d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd)
+					// We're only deleting the dentry, not the file it
+					// represents, so we don't need to update
+					// victimParent.dirents etc.
+				}
+				victimParent.dirMu.Unlock()
+			}
+			victim.destroyLocked()
+		}
+	}
+}
+
+// destroyLocked tears d down: it releases d's handle and fid, drops cached data, and unrefs d's parent.
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is not a child dentry.
+func (d *dentry) destroyLocked() {
+	ctx := context.Background()
+	d.handleMu.Lock()
+	if !d.handle.file.isNil() {
+		mf := d.fs.mfp.MemoryFile()
+		d.dataMu.Lock()
+		// Write dirty pages back to the remote filesystem (a failure is only logged).
+		if d.handleWritable {
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+				log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err)
+			}
+		}
+		// Discard cached data.
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+		d.dataMu.Unlock()
+		// Clunk open fids and close open host FDs.
+		d.handle.close(ctx)
+	}
+	d.handleMu.Unlock()
+	d.file.close(ctx) // close d's own fid; the returned error is discarded
+	// Remove d from the set of all dentries.
+	d.fs.syncMu.Lock()
+	delete(d.fs.dentries, d)
+	d.fs.syncMu.Unlock()
+	// Drop the reference held by d on its parent.
+	if parentVFSD := d.vfsd.Parent(); parentVFSD != nil {
+		parent := parentVFSD.Impl().(*dentry)
+		// This is parent.DecRef() without recursive locking of d.fs.renameMu.
+		if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 {
+			parent.checkCachingLocked()
+		} else if refs < 0 {
+			panic("gofer.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+// isDeleted reports whether d.deleted is non-zero, the value stored by
+// setDeleted. The flag is read with an atomic load.
+func (d *dentry) isDeleted() bool { return atomic.LoadUint32(&d.deleted) != 0 }
+
+// setDeleted atomically stores 1 in d.deleted, which is what isDeleted
+// checks for. Nothing in this method clears the flag again.
+func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) }
+
+// listxattr always returns (nil, syserror.ENOTSUP); no request is sent to the
+// remote file.
+func (d *dentry) listxattr(ctx context.Context) ([]string, error) { return nil, syserror.ENOTSUP }
+
+// getxattr returns the value of extended attribute name as reported by
+// d.file, passing linux.XATTR_SIZE_MAX as the size argument.
+// TODO(jamieliu): add vfs.GetxattrOptions.Size
+func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) }
+
+// setxattr forwards opts.Name, opts.Value and opts.Flags to d.file.setXattr
+// and returns its error unchanged.
+func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) }
+
+// removexattr always returns syserror.ENOTSUP; no request is sent to the
+// remote file.
+func (d *dentry) removexattr(ctx context.Context, name string) error { return syserror.ENOTSUP }
+
+// ensureSharedHandle makes d.handle usable for the requested access, opening a
+// replacement handle when the current one is insufficient or trunc is set.
+// Preconditions: d.isRegularFile() || d.isDirectory().
+func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
+	// O_TRUNC unconditionally requires us to obtain a new handle (opened with O_TRUNC).
+	if !trunc {
+		d.handleMu.RLock()
+		if (!read || d.handleReadable) && (!write || d.handleWritable) {
+			// The current handle is sufficient.
+			d.handleMu.RUnlock()
+			return nil
+		}
+		d.handleMu.RUnlock()
+	}
+
+	haveOldFD := false
+	d.handleMu.Lock()
+	if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc {
+		// Get a new handle.
+		wantReadable := d.handleReadable || read
+		wantWritable := d.handleWritable || write
+		h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc)
+		if err != nil {
+			d.handleMu.Unlock()
+			return err
+		}
+		if !d.handle.file.isNil() {
+			// Check that old and new handles are compatible: If the old handle
+			// includes a host file descriptor but the new one does not, or
+			// vice versa, old and new memory mappings may be incoherent.
+			haveOldFD = d.handle.fd >= 0
+			haveNewFD := h.fd >= 0
+			if haveOldFD != haveNewFD {
+				d.handleMu.Unlock()
+				ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD)
+				h.close(ctx)
+				return syserror.EIO
+			}
+			if haveOldFD {
+				// We may have raced with callers of d.pf.FD() that are now
+				// using the old file descriptor, preventing us from safely
+				// closing it. We could handle this by invalidating existing
+				// memmap.Translations, but this is expensive. Instead, use
+				// dup2() to make the old file descriptor refer to the new file
+				// description, then close the new file descriptor (which is no
+				// longer needed). Racing callers may use the old or new file
+				// description, but this doesn't matter since they refer to the
+				// same file (unless d.fs.opts.overlayfsStaleRead is true,
+				// which we handle separately).
+				if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil {
+					d.handleMu.Unlock()
+					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
+					h.close(ctx)
+					return err
+				}
+				syscall.Close(int(h.fd))
+				h.fd = d.handle.fd
+				if d.fs.opts.overlayfsStaleRead {
+					// Replace sentry mappings of the old FD with mappings of
+					// the new FD, since the two are not necessarily coherent.
+					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
+						d.handleMu.Unlock()
+						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
+						h.close(ctx) // NOTE(review): h.fd == d.handle.fd here, so this also closes the FD d.handle still holds; confirm intended
+						return err
+					}
+				}
+			}
+			// Clunk the old fid, with or without a host FD (else it leaks), before unlocking d.handleMu exposes the new handle.
+			d.handle.file.close(ctx)
+		}
+		// Switch to the new handle.
+		d.handle = h
+		d.handleReadable = wantReadable
+		d.handleWritable = wantWritable
+	}
+	d.handleMu.Unlock()
+
+	if d.fs.opts.overlayfsStaleRead && haveOldFD {
+		// Invalidate application mappings that may be using the old FD; they
+		// will be replaced with mappings using the new FD after future calls
+		// to d.Translate(). This requires holding d.mapsMu, which precedes
+		// d.handleMu in the lock order.
+		d.mapsMu.Lock()
+		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+		d.mapsMu.Unlock()
+	}
+
+	return nil
+}
+
+// fileDescription is embedded by gofer implementations of vfs.FileDescriptionImpl
+// and supplies the Stat/SetStat and xattr methods defined on it in this file.
+type fileDescription struct {
+	vfsfd vfs.FileDescription // used to reach this description's Mount and Dentry
+	vfs.FileDescriptionDefaultImpl
+}
+
+// filesystem returns the gofer filesystem behind fd's mount; the type
+// assertion panics if the mount's filesystem is not a *filesystem.
+func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) }
+
+// dentry returns the gofer dentry that fd was opened on; the type assertion
+// panics if the VFS dentry's implementation is not a *dentry.
+func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) }
+
+// Stat implements vfs.FileDescriptionImpl.Stat; in InteropModeShared it may first refresh the dentry via updateFromGetattr.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	d := fd.dentry()
+	wantsCachedAttrs := opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0
+	if wantsCachedAttrs && d.fs.opts.interop == InteropModeShared && opts.Sync != linux.AT_STATX_DONT_SYNC {
+		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if available?
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	var statx linux.Statx
+	d.statTo(&statx)
+	return statx, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat. It forwards to
+// dentry.setStat with the credentials taken from ctx, a pointer to opts.Stat,
+// and fd's mount.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()) }
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+//
+// It delegates to dentry.listxattr on the dentry fd was opened on.
+func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { return fd.dentry().listxattr(ctx) }
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+//
+// It delegates to dentry.getxattr on the dentry fd was opened on.
+func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { return fd.dentry().getxattr(ctx, name) }
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+//
+// It delegates to dentry.setxattr, passing a pointer to its own copy of opts.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { return fd.dentry().setxattr(ctx, &opts) }
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+//
+// It delegates to dentry.removexattr on the dentry fd was opened on.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.dentry().removexattr(ctx, name) }
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
new file mode 100644
index 000000000..cfe66f797
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -0,0 +1,135 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// handle represents a remote "open file descriptor", consisting of an opened
+// fid (p9.File) and optionally a host file descriptor.
+type handle struct {
+	file p9file // opened fid; handle.close resets it to the zero p9file
+	fd   int32  // -1 if unavailable
+}
+
+// openHandle walks file with no names to obtain a fresh p9file and opens it
+// with the access mode implied by read/write (plus p9.OpenTruncate if trunc).
+// The returned handle carries the new p9file and a host FD if open returned one.
+//
+// Preconditions: read || write.
+func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) {
+	_, clone, err := file.walk(ctx, nil)
+	if err != nil {
+		return handle{fd: -1}, err
+	}
+	var mode p9.OpenFlags // stays at its zero value if neither read nor write is set
+	if read && write {
+		mode = p9.ReadWrite
+	} else if write {
+		mode = p9.WriteOnly
+	} else if read {
+		mode = p9.ReadOnly
+	}
+	if trunc {
+		mode |= p9.OpenTruncate
+	}
+	hostFD, _, _, err := clone.open(ctx, mode)
+	if err != nil {
+		clone.close(ctx)
+		return handle{fd: -1}, err
+	}
+	h := handle{file: clone, fd: -1}
+	if hostFD != nil {
+		h.fd = int32(hostFD.Release())
+	}
+	return h, nil
+}
+
+func (h *handle) close(ctx context.Context) {
+	h.file.close(ctx) // clunk the fid; its error is discarded
+	h.file = p9file{}
+	if hostFD := h.fd; hostFD >= 0 {
+		syscall.Close(int(hostFD)) // close the host FD, then mark it unavailable
+		h.fd = -1
+	}
+}
+
+// readToBlocksAt reads into dsts from offset, using the host FD (h.fd) when it is available and the fid (h.file) otherwise.
+func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+	switch {
+	case dsts.IsEmpty():
+		return 0, nil
+	case h.fd >= 0:
+		ctx.UninterruptibleSleepStart(false)
+		nread, err := hostPreadv(h.fd, dsts, int64(offset))
+		ctx.UninterruptibleSleepFinish(false)
+		return nread, err
+	case dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy():
+		nread, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset)
+		return uint64(nread), err
+	}
+	// Buffer the read since p9.File.ReadAt() takes []byte.
+	scratch := make([]byte, dsts.NumBytes())
+	nread, rerr := h.file.readAt(ctx, scratch, offset)
+	if nread == 0 {
+		return 0, rerr
+	}
+	if copied, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(scratch[:nread]))); cperr != nil {
+		return copied, cperr
+	}
+	return uint64(nread), rerr
+}
+
+// writeFromBlocksAt writes srcs at offset, using the host FD (h.fd) when it is available and the fid (h.file) otherwise.
+func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+	switch {
+	case srcs.IsEmpty():
+		return 0, nil
+	case h.fd >= 0:
+		ctx.UninterruptibleSleepStart(false)
+		nwritten, err := hostPwritev(h.fd, srcs, int64(offset))
+		ctx.UninterruptibleSleepFinish(false)
+		return nwritten, err
+	case srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy():
+		nwritten, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset)
+		return uint64(nwritten), err
+	}
+	// Buffer the write since p9.File.WriteAt() takes []byte.
+	scratch := make([]byte, srcs.NumBytes())
+	copied, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(scratch)), srcs)
+	if copied == 0 {
+		return 0, cperr
+	}
+	nwritten, werr := h.file.writeAt(ctx, scratch[:copied], offset)
+	if werr != nil {
+		return uint64(nwritten), werr
+	}
+	return copied, cperr
+}
+
+// sync calls fsync(2) on h.fd when the host FD is available, and p9file.fsync on h.file otherwise.
+func (h *handle) sync(ctx context.Context) error {
+	if h.fd < 0 {
+		return h.file.fsync(ctx)
+	}
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return syscall.Fsync(int(h.fd))
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
new file mode 100644
index 000000000..19560ab26
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// hostPreadv reads from host file descriptor fd at offset off into dsts and
+// returns the number of bytes read, or 0 and the errno if the syscall fails.
+//
+// Preconditions: !dsts.IsEmpty().
+func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) {
+	// No buffering is needed, safecopy or not: host syscalls return EFAULT if appropriate instead of raising SIGBUS.
+	var nread uintptr
+	var errno syscall.Errno
+	if dsts.NumBlocks() == 1 {
+		// A single block goes through pread() rather than preadv(), avoiding iovec allocation and copying.
+		dst := dsts.Head()
+		nread, _, errno = syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0)
+	} else {
+		iovs := safemem.IovecsFromBlockSeq(dsts)
+		nread, _, errno = syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
+	}
+	if errno != 0 {
+		return 0, errno
+	}
+	return uint64(nread), nil
+}
+
+// hostPwritev writes srcs to host file descriptor fd at offset off and
+// returns the number of bytes written, or 0 and the errno if the syscall fails.
+//
+// Preconditions: !srcs.IsEmpty().
+func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) {
+	// No buffering is needed, safecopy or not: host syscalls return EFAULT if appropriate instead of raising SIGBUS.
+	var nwritten uintptr
+	var errno syscall.Errno
+	if srcs.NumBlocks() == 1 {
+		// A single block goes through pwrite() rather than pwritev(), avoiding iovec allocation and copying.
+		src := srcs.Head()
+		nwritten, _, errno = syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0)
+	} else {
+		iovs := safemem.IovecsFromBlockSeq(srcs)
+		nwritten, _, errno = syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
+	}
+	if errno != 0 {
+		return 0, errno
+	}
+	return uint64(nwritten), nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
new file mode 100644
index 000000000..755ac2985
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -0,0 +1,219 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// p9file is a wrapper around p9.File that provides methods that are Context-aware:
+// each wrapper brackets its p9.File call with ctx.UninterruptibleSleepStart/Finish.
+type p9file struct {
+	file p9.File // nil in the zero p9file; see isNil
+}
+
+// isNil reports whether f wraps no p9.File at all, which is the case for
+// the zero p9file.
+func (f p9file) isNil() bool { return f.file == nil }
+
+func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	qids, walked, err := f.file.Walk(names) // the walked p9.File is wrapped as-is, even when err != nil
+	return qids, p9file{walked}, err
+}
+
+func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	qids, walked, mask, attr, err := f.file.WalkGetAttr(names) // the walked p9.File is wrapped as-is, even when err != nil
+	return qids, p9file{walked}, mask, attr, err
+}
+
+// walkGetAttrOne walks the single path component name via p9.File.WalkGetAttr
+// and returns its one qid; any other qid count is logged and reported as EIO.
+func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, child, mask, attr, err := f.file.WalkGetAttr([]string{name})
+	ctx.UninterruptibleSleepFinish(false)
+	switch {
+	case err != nil:
+		return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err
+	case len(qids) == 1:
+		return qids[0], p9file{child}, mask, attr, nil
+	}
+	ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids)
+	if child != nil {
+		p9file{child}.close(ctx)
+	}
+	return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO
+}
+
+// statFS calls p9.File.StatFS between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.StatFS()
+}
+
+// getAttr calls p9.File.GetAttr(req) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.GetAttr(req)
+}
+
+// setAttr calls p9.File.SetAttr(valid, attr) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.SetAttr(valid, attr)
+}
+
+// getXattr calls p9.File.GetXattr(name, size) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.GetXattr(name, size)
+}
+
+// setXattr calls p9.File.SetXattr(name, value, flags) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.SetXattr(name, value, flags)
+}
+
+// allocate calls p9.File.Allocate(mode, offset, length) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Allocate(mode, offset, length)
+}
+
+// close calls p9.File.Close between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) close(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Close()
+}
+
+// open calls p9.File.Open(flags) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Open(flags)
+}
+
+// readAt calls p9.File.ReadAt(p, offset) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.ReadAt(p, offset)
+}
+
+// writeAt calls p9.File.WriteAt(p, offset) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.WriteAt(p, offset)
+}
+
+// fsync calls p9.File.FSync between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) fsync(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.FSync()
+}
+
+func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	hostFD, created, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid) // created is wrapped as-is, even when err != nil
+	return hostFD, p9file{created}, qid, iounit, err
+}
+
+// mkdir calls p9.File.Mkdir(name, permissions, uid, gid) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Mkdir(name, permissions, uid, gid)
+}
+
+// symlink calls p9.File.Symlink(oldName, newName, uid, gid) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Symlink(oldName, newName, uid, gid)
+}
+
+// link calls p9.File.Link(target.file, newName) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) link(ctx context.Context, target p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Link(target.file, newName)
+}
+
+// mknod calls p9.File.Mknod(name, mode, major, minor, uid, gid) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Mknod(name, mode, major, minor, uid, gid)
+}
+
+// rename calls p9.File.Rename(newDir.file, newName) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Rename(newDir.file, newName)
+}
+
+// unlinkAt calls p9.File.UnlinkAt(name, flags) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.UnlinkAt(name, flags)
+}
+
+// readdir calls p9.File.Readdir(offset, count) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Readdir(offset, count)
+}
+
+// readlink calls p9.File.Readlink between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) readlink(ctx context.Context) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Readlink()
+}
+
+// flush calls p9.File.Flush between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) flush(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Flush()
+}
+
+// connect calls p9.File.Connect(flags) between ctx.UninterruptibleSleepStart and ctx.UninterruptibleSleepFinish.
+func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) {
+	ctx.UninterruptibleSleepStart(false)
+	defer ctx.UninterruptibleSleepFinish(false)
+	return f.file.Connect(flags)
+}
diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go
new file mode 100644
index 000000000..847cb0784
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/pagemath.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// These are equivalent to usermem.Addr.RoundDown/Up, but without the
+// potentially truncating conversion to usermem.Addr. This is necessary because
+// there is no way to define generic "PageRoundDown/Up" functions in Go.
+
+// pageRoundDown clears the bits of x below usermem.PageSize, i.e. rounds x
+// down to a page boundary (assuming PageSize is a power of two).
+func pageRoundDown(x uint64) uint64 { return x &^ (usermem.PageSize - 1) }
+
+// pageRoundUp rounds x up to a page boundary. The addition wraps around for x
+// within a page of the top of the uint64 range; callers must check for that.
+func pageRoundUp(x uint64) uint64 { return pageRoundDown(x + usermem.PageSize - 1) }
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
new file mode 100644
index 000000000..8e11e06b3
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -0,0 +1,860 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// isRegularFile reports whether d's file type, as returned by d.fileType(),
+// is linux.S_IFREG.
+func (d *dentry) isRegularFile() bool { return d.fileType() == linux.S_IFREG }
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset that Read and Write start from and advance. off is protected by mu.
+	mu  sync.Mutex
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It is a no-op:
+// regularFileFD has nothing of its own to release here.
+func (fd *regularFileFD) Release() {}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose; it flushes the dentry's fid for writable FDs outside InteropModeExclusive.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	// Skip flushing if writes may be buffered by the client, since (as with
+	// the VFS1 client) we don't flush buffered writes on close anyway.
+	d := fd.dentry()
+	if d.fs.opts.interop != InteropModeExclusive {
+		d.handleMu.RLock()
+		defer d.handleMu.RUnlock()
+		return d.handle.file.flush(ctx)
+	}
+	return nil
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead. Negative offsets yield EINVAL and non-zero opts.Flags yield EOPNOTSUPP.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Check for reading at EOF before calling into MM (but not under
+	// InteropModeShared, which makes d.size unreliable).
+	d := fd.dentry()
+	if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) {
+		return 0, io.EOF
+	}
+
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Lock d.metadataMu for the rest of the read to prevent d.size from
+		// changing.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+		// Write dirty cached pages that will be touched by the read back to
+		// the remote file.
+		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
+			return 0, err
+		}
+	}
+
+	rw := getDentryReadWriter(ctx, d, offset)
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Require the read to go to the remote file.
+		rw.direct = true
+	}
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putDentryReadWriter(rw) // rw goes back to the pool and must not be used after this point
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+		d.touchAtime(ctx, fd.vfsfd.Mount())
+	}
+	return n, err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read: a PRead at fd.off that advances fd.off by the bytes read, under fd.mu.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	nread, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += nread
+	return nread, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite. It holds d.metadataMu for the whole write.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
+		// file_update_time(). This is d.touchCMtime(), but without locking
+		// d.metadataMu (recursively).
+		if now, ok := nowFromContext(ctx); ok {
+			atomic.StoreInt64(&d.mtime, now)
+			atomic.StoreInt64(&d.ctime, now)
+		}
+	}
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Write dirty cached pages that will be touched by the write back to
+		// the remote file.
+		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+			return 0, err
+		}
+		// Remove touched pages from the cache.
+		pgstart := pageRoundDown(uint64(offset))
+		pgend := pageRoundUp(uint64(offset + src.NumBytes()))
+		if pgend < pgstart { // pageRoundUp wrapped around uint64
+			return 0, syserror.EINVAL
+		}
+		mr := memmap.MappableRange{pgstart, pgend}
+		var freed []platform.FileRange
+		d.dataMu.Lock()
+		cseg := d.cache.LowerBoundSegment(mr.Start)
+		for cseg.Ok() && cseg.Start() < mr.End {
+			cseg = d.cache.Isolate(cseg, mr)
+			freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+			cseg = d.cache.Remove(cseg).NextSegment()
+		}
+		d.dataMu.Unlock()
+		// Invalidate mappings of removed pages.
+		d.mapsMu.Lock()
+		d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
+		d.mapsMu.Unlock()
+		// Finally free pages removed from the cache.
+		mf := d.fs.mfp.MemoryFile()
+		for _, freedFR := range freed {
+			mf.DecRef(freedFR)
+		}
+	}
+	rw := getDentryReadWriter(ctx, d, offset)
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Require the write to go to the remote file.
+		rw.direct = true
+	}
+	n, err := src.CopyInTo(ctx, rw)
+	putDentryReadWriter(rw) // rw goes back to the pool and must not be used after this point
+	if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+		// Write dirty cached pages touched by the write back to the remote
+		// file.
+		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+			return 0, err
+		}
+		// Request the remote filesystem to sync the remote file.
+		if err := d.handle.file.fsync(ctx); err != nil { // NOTE(review): d.handle is read without d.handleMu here; confirm this is safe
+			return 0, err
+		}
+	}
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write: a PWrite at fd.off that advances fd.off by the bytes written, under fd.mu.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	nwritten, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += nwritten
+	return nwritten, err
+}
+
+type dentryReadWriter struct {
+	ctx    context.Context // context passed to the underlying reads and cache fills
+	d      *dentry         // dentry whose data is being transferred
+	off    uint64          // current file offset; advanced as bytes are transferred
+	direct bool            // if true, ReadToBlocks reads from dentry.handle instead of the cache
+}
+
+// dentryReadWriterPool recycles dentryReadWriter values between
+// getDentryReadWriter and putDentryReadWriter.
+var dentryReadWriterPool = sync.Pool{
+	New: func() interface{} { return &dentryReadWriter{} },
+}
+
+// getDentryReadWriter takes a dentryReadWriter from the pool and overwrites
+// every field for a transfer on d starting at offset; direct starts out
+// false and is set by callers that need it.
+func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
+	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
+	*rw = dentryReadWriter{ctx: ctx, d: d, off: uint64(offset)}
+	return rw
+}
+
+// putDentryReadWriter clears rw's ctx and d references and returns rw to the pool.
+func putDentryReadWriter(rw *dentryReadWriter) {
+	rw.ctx, rw.d = nil, nil
+	dentryReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks. It reads from rw.off into dsts and advances rw.off by the bytes read.
+func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents and makes dentry.size
+	// unreliable), or if the file was opened O_DIRECT, read directly from
+	// dentry.handle without locking dentry.dataMu.
+	rw.d.handleMu.RLock()
+	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off)
+		rw.d.handleMu.RUnlock()
+		rw.off += n
+		return n, err
+	}
+
+	// Otherwise read from/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	fillCache := mf.ShouldCacheEvictable()
+	var dataMuUnlock func()
+	if fillCache {
+		rw.d.dataMu.Lock() // filling the cache mutates it, so take dataMu for writing
+		dataMuUnlock = rw.d.dataMu.Unlock
+	} else {
+		rw.d.dataMu.RLock()
+		dataMuUnlock = rw.d.dataMu.RUnlock
+	}
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= rw.d.size {
+		dataMuUnlock()
+		rw.d.handleMu.RUnlock()
+		return 0, io.EOF
+	}
+	end := rw.d.size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += n
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			gapMR := gap.Range().Intersect(mr)
+			if fillCache {
+				// Read into the cache, then re-enter the loop to read from the
+				// cache.
+				reqMR := memmap.MappableRange{
+					Start: pageRoundDown(gapMR.Start),
+					End:   pageRoundUp(gapMR.End),
+				}
+				optMR := gap.Range()
+				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
+				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
+				seg, gap = rw.d.cache.Find(rw.off)
+				if !seg.Ok() {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+				// err might have occurred in part of gap.Range() outside
+				// gapMR. Forget about it for now; if the error matters and
+				// persists, we'll run into it again in a later iteration of
+				// this loop.
+			} else {
+				// Read directly from the file.
+				gapDsts := dsts.TakeFirst64(gapMR.Length())
+				n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+				done += n
+				rw.off += n
+				dsts = dsts.DropFirst64(n)
+				// Partial reads are fine. But we must stop reading.
+				if n != gapDsts.NumBytes() || err != nil {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+
+				// Continue: the gap was read in full, so move on to the segment after it.
+				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+			}
+		}
+	}
+	dataMuUnlock()
+	rw.d.handleMu.RUnlock()
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It writes srcs to rw.d at rw.off, advancing rw.off by the bytes written.
+//
+// Preconditions: rw.d.metadataMu must be locked.
+func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents), or if the file was
+	// opened with O_DIRECT, write directly to dentry.handle without locking
+	// dentry.dataMu.
+	rw.d.handleMu.RLock()
+	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+		rw.d.handleMu.RUnlock()
+		rw.off += n
+		return n, err
+	}
+
+	// Otherwise write to/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	rw.d.dataMu.Lock()
+
+	// Compute the range to write; if rw.off + NumBytes wraps around, end is capped at math.MaxInt64.
+	start := rw.off
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			segMR := seg.Range().Intersect(mr)
+			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			rw.d.dirty.MarkDirty(segMR)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Write directly to the file. At present, we never fill the cache
+			// when writing, since doing so can convert small writes into
+			// inefficient read-modify-write cycles, and we have no mechanism
+			// for detecting or avoiding this.
+			gapMR := gap.Range().Intersect(mr)
+			gapSrcs := srcs.TakeFirst64(gapMR.Length())
+			n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			// Partial writes are fine. But we must stop writing.
+			if n != gapSrcs.NumBytes() || err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop: // reached when the loop finishes, on error, or after a short write
+	if rw.off > rw.d.size {
+		atomic.StoreUint64(&rw.d.size, rw.off)
+		// The remote file's size will implicitly be extended to the correct
+		// value when we write back to it.
+	}
+	// If InteropModeWritethrough is in effect, flush written data back to the
+	// remote filesystem.
+	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
+		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
+			Start: start,
+			End:   rw.off,
+		}, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil {
+			// We have no idea how many bytes were actually flushed, so report the whole write as failed.
+			rw.off = start
+			done = 0
+			retErr = err
+		}
+	}
+	rw.d.dataMu.Unlock()
+	rw.d.handleMu.RUnlock()
+	return done, retErr
+}
+
+// writeback flushes dirty cached data in the size bytes of d starting at
+// offset via d.handle. The range is clamped to d.size; nothing is done if
+// size is zero or offset is at or past d.size.
+func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
+	if size == 0 {
+		return nil
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	start := uint64(offset)
+	if start >= d.size {
+		return nil
+	}
+	// Clamp the end of the range to the file size, ignoring a requested end
+	// that overflowed.
+	limit := int64(d.size)
+	if reqEnd := offset + size; reqEnd > offset && reqEnd < limit {
+		limit = reqEnd
+	}
+	mr := memmap.MappableRange{
+		Start: start,
+		End:   uint64(limit),
+	}
+	return fsutil.SyncDirty(ctx, mr, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+//
+// SEEK_DATA and SEEK_HOLE treat the file as one contiguous run of data ending
+// at the file size. In InteropModeShared, d.updateFromGetattr is called
+// before the size is read.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// offset is already absolute.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
+		// All three depend on the file size, so make sure it is current.
+		d := fd.dentry()
+		if fd.filesystem().opts.interop == InteropModeShared {
+			if err := d.updateFromGetattr(ctx); err != nil {
+				return 0, err
+			}
+		}
+		fileSize := int64(atomic.LoadUint64(&d.size))
+		if whence == linux.SEEK_END {
+			offset += fileSize
+			break
+		}
+		// SEEK_DATA or SEEK_HOLE: there is nothing to find past the end.
+		if offset > fileSize {
+			return 0, syserror.ENXIO
+		}
+		if whence == linux.SEEK_HOLE {
+			// The only hole is the implicit one at end of file.
+			offset = fileSize
+		}
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync by syncing the dentry's shared
+// handle.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	d := fd.dentry()
+	return d.syncSharedHandle(ctx)
+}
+
+// syncSharedHandle writes d's dirty cached data through d.handle and then
+// syncs d.handle. It does nothing if d.handleWritable is false.
+func (d *dentry) syncSharedHandle(ctx context.Context) error {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	if !d.handleWritable {
+		return nil
+	}
+	// Flush dirty cached data to the remote file first.
+	d.dataMu.Lock()
+	flushErr := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+	d.dataMu.Unlock()
+	if flushErr != nil {
+		return flushErr
+	}
+	// Then sync the remote file itself.
+	return d.handle.sync(ctx)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	d := fd.dentry()
+	switch d.fs.opts.interop {
+	case InteropModeExclusive:
+		// Any mapping is fine.
+	case InteropModeWritethrough:
+		// Shared writable mappings require a host FD, since otherwise we can't
+		// synchronously flush memory-mapped writes to the remote file.
+		if opts.Private || !opts.MaxPerms.Write {
+			break
+		}
+		fallthrough
+	case InteropModeShared:
+		// All mappings require a host FD to be coherent with other filesystem
+		// users.
+		if d.fs.opts.forcePageCache {
+			// Whether or not we have a host FD, we're not allowed to use it.
+			return syserror.ENODEV
+		}
+		d.handleMu.RLock()
+		haveFD := d.handle.fd >= 0
+		d.handleMu.RUnlock()
+		if !haveFD {
+			return syserror.ENODEV
+		}
+	default:
+		panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
+	}
+	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
+}
+
+// mayCachePages reports false in InteropModeShared, true when forcePageCache
+// is set, and otherwise whether d.handle currently has a host FD.
+func (d *dentry) mayCachePages() bool {
+	switch {
+	case d.fs.opts.interop == InteropModeShared:
+		return false
+	case d.fs.opts.forcePageCache:
+		return true
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return d.handle.fd >= 0
+}
+
+// AddMapping implements memmap.Mappable.AddMapping. It always returns nil.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	added := d.mappings.AddMapping(ms, ar, offset, writable)
+	// Take host file mapper references unconditionally, since whether we
+	// have a host FD can change across save/restore.
+	for _, mr := range added {
+		d.pf.hostFileMapper.IncRefOn(mr)
+	}
+	if !d.mayCachePages() {
+		return nil
+	}
+	// d.Evict() will refuse to evict memory-mapped pages, so tell the
+	// MemoryFile to not bother trying.
+	mf := d.fs.mfp.MemoryFile()
+	for _, mr := range added {
+		mf.MarkUnevictable(d, pgalloc.EvictableRange{mr.Start, mr.End})
+	}
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	removed := d.mappings.RemoveMapping(ms, ar, offset, writable)
+	for _, mr := range removed {
+		d.pf.hostFileMapper.DecRefOn(mr)
+	}
+	if !d.mayCachePages() {
+		return
+	}
+	// Pages that are no longer referenced by any application memory mappings
+	// are now considered unused; allow MemoryFile to evict them when
+	// necessary.
+	mf := d.fs.mfp.MemoryFile()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	for _, mr := range removed {
+		// Since these pages are no longer mapped, they are no longer
+		// concurrently dirtyable by a writable memory mapping.
+		d.dirty.AllowClean(mr)
+		mf.MarkEvictable(d, pgalloc.EvictableRange{mr.Start, mr.End})
+	}
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping. The copy is registered
+// as a new mapping of dstAR; srcAR is not used.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	err := d.AddMapping(ctx, ms, dstAR, offset, writable)
+	return err
+}
+
+// Translate implements memmap.Mappable.Translate. With a usable host FD it translates to d.pf; otherwise it fills d.cache and translates to the MemoryFile.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	d.handleMu.RLock()
+	if d.handle.fd >= 0 && !d.fs.opts.forcePageCache {
+		d.handleMu.RUnlock()
+		mr := optional
+		if d.fs.opts.limitHostFDTranslation {
+			mr = maxFillRange(required, optional)
+		}
+		return []memmap.Translation{
+			{
+				Source: mr,
+				File:   &d.pf,
+				Offset: mr.Start,
+				Perms:  usermem.AnyAccess,
+			},
+		}, nil
+	}
+
+	d.dataMu.Lock()
+
+	// Constrain translations to d.size (rounded up) to prevent translation to
+	// pages that may be concurrently truncated.
+	pgend := pageRoundUp(d.size)
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			d.dataMu.Unlock()
+			d.handleMu.RUnlock()
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	mf := d.fs.mfp.MemoryFile()
+	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt)
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		// TODO(jamieliu): Make Translations writable even if writability is
+		// not required if already kept-dirty by another writable translation.
+		perms := usermem.AccessType{
+			Read:    true,
+			Execute: true,
+		}
+		if at.Write {
+			// From this point forward, this memory can be dirtied through the
+			// mapping at any time.
+			d.dirty.KeepDirty(segMR)
+			perms.Write = true
+		}
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   mf,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  perms,
+		})
+		translatedEnd = segMR.End
+	}
+
+	d.dataMu.Unlock()
+	d.handleMu.RUnlock()
+
+	// Don't return the error returned by d.cache.Fill if it occurred outside
+	// of required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+// maxFillRange returns the range to fill given the required and optional
+// ranges. required is returned as-is if it is already at least maxReadahead
+// long, and optional is returned as-is if it is no longer than maxReadahead.
+// Otherwise optional is trimmed to start at required.Start and to span at
+// most maxReadahead bytes.
+func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
+	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
+	switch {
+	case required.Length() >= maxReadahead:
+		return required
+	case optional.Length() <= maxReadahead:
+		return optional
+	}
+	trimmed := optional
+	trimmed.Start = required.Start
+	if trimmed.Length() > maxReadahead {
+		trimmed.End = trimmed.Start + maxReadahead
+	}
+	return trimmed
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. It
+// invalidates every translation of d, writes dirty cached data back through
+// d.handle, and then drops the cache. It returns the writeback error, if any,
+// in which case the cache is left in place.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+	// Whether we have a host fd (and consequently what platform.File is
+	// mapped) can change across save/restore, so invalidate all translations
+	// unconditionally.
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+
+	// Write the cache's contents back to the remote file so that if we have a
+	// host fd after restore, the remote file's contents are coherent.
+	mf := d.fs.mfp.MemoryFile()
+	// Hold d.handleMu for reading while d.handle is in use, as writeback and
+	// syncSharedHandle do. It is taken after d.mapsMu and before d.dataMu,
+	// the same order in which the other methods of dentry nest these locks.
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+		return err
+	}
+
+	// Discard the cache so that it's not stored in saved state. This is safe
+	// because per InvalidateUnsavable invariants, no new translations can have
+	// been returned after we invalidated all existing translations above.
+	d.cache.DropAll(mf)
+	d.dirty.RemoveAll()
+
+	return nil
+}
+
+// Evict implements pgalloc.EvictableMemoryUser.Evict. For each part of er that
+// is not memory-mapped, it writes dirty cached data back through d.handle and
+// drops the cached pages. Writeback failures are logged and the pages are
+// dropped regardless.
+func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	// Hold d.handleMu for reading while d.handle is in use, as writeback and
+	// syncSharedHandle do. It is taken after d.mapsMu and before d.dataMu,
+	// the same order in which the other methods of dentry nest these locks.
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+
+	mr := memmap.MappableRange{er.Start, er.End}
+	mf := d.fs.mfp.MemoryFile()
+	// Only allow pages that are no longer memory-mapped to be evicted.
+	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
+		mgapMR := mgap.Range().Intersect(mr)
+		if mgapMR.Length() == 0 {
+			continue
+		}
+		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
+		}
+		d.cache.Drop(mgapMR, mf)
+		d.dirty.KeepClean(mgapMR)
+	}
+}
+
+// dentryPlatformFile implements platform.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef.
+//
+// dentryPlatformFile is only used when a host FD representing the remote file
+// is available (i.e. dentry.handle.fd >= 0), and that FD is used for
+// application memory mappings (i.e. !filesystem.opts.forcePageCache).
+type dentryPlatformFile struct {
+	*dentry
+
+	// fdRefs counts references on platform.File offsets; IncRef and DecRef
+	// update it. fdRefs is protected by dentry.dataMu.
+	fdRefs fsutil.FrameRefSet
+
+	// If this dentry represents a regular file, and handle.fd >= 0,
+	// hostFileMapper caches mappings of handle.fd (see MapInternal).
+	hostFileMapper fsutil.HostFileMapper
+}
+
+// IncRef implements platform.File.IncRef. It increments the count in d.fdRefs for every offset in fr, inserting segments with count 1 where none exist.
+func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
+	d.dataMu.Lock()
+	seg, gap := d.fdRefs.Find(fr.Start)
+	for {
+		switch {
+		case seg.Ok() && seg.Start() < fr.End: // existing segment overlapping fr: bump its count
+			seg = d.fdRefs.Isolate(seg, fr)
+			seg.SetValue(seg.Value() + 1)
+			seg, gap = seg.NextNonEmpty()
+		case gap.Ok() && gap.Start() < fr.End: // hole inside fr: account it as mapped, insert with count 1
+			newRange := gap.Range().Intersect(fr)
+			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
+			seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
+		default: // nothing left that starts before fr.End
+			d.fdRefs.MergeAdjacent(fr)
+			d.dataMu.Unlock()
+			return
+		}
+	}
+}
+
+// DecRef implements platform.File.DecRef. It decrements the count in
+// d.fdRefs for each segment overlapping fr, removing segments (and reducing
+// the Mapped memory accounting) when their count was 1.
+func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	for seg := d.fdRefs.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; {
+		seg = d.fdRefs.Isolate(seg, fr)
+		refs := seg.Value()
+		if refs != 1 {
+			seg.SetValue(refs - 1)
+			seg = seg.NextSegment()
+			continue
+		}
+		// Last reference: stop accounting the range and drop the segment.
+		usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
+		seg = d.fdRefs.Remove(seg).NextSegment()
+	}
+	d.fdRefs.MergeAdjacent(fr)
+}
+
+// MapInternal implements platform.File.MapInternal by mapping fr of
+// d.handle.fd through d.hostFileMapper, with d.handleMu read-locked.
+func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
+}
+
+// FD implements platform.File.FD. It returns d.handle.fd, read with
+// d.handleMu read-locked.
+func (d *dentryPlatformFile) FD() int {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return int(d.handle.fd)
+}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
new file mode 100644
index 000000000..08c691c47
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -0,0 +1,159 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// specialFileFD implements vfs.FileDescriptionImpl for files other than
+// regular files, directories, and symlinks: pipes, sockets, etc. It is also
+// used for regular files when filesystemOptions.specialRegularFiles is in
+// effect. specialFileFD differs from regularFileFD by using per-FD handles
+// instead of shared per-dentry handles, and never buffering I/O.
+type specialFileFD struct {
+	fileDescription
+
+	// handle is this FD's own handle; Release closes it. handle is immutable.
+	handle handle
+
+	// off is the file offset. off is protected by mu. (POSIX 2.9.7 only
+	// requires operations using the file offset to be atomic for regular files
+	// and symlinks; however, since specialFileFD may be used for regular
+	// files, we apply this atomicity unconditionally.)
+	mu  sync.Mutex
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It closes fd.handle
+// and removes fd from the filesystem's specialFileFDs set.
+func (fd *specialFileFD) Release() {
+	fd.handle.close(context.Background())
+	owner := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+	owner.syncMu.Lock()
+	defer owner.syncMu.Unlock()
+	delete(owner.specialFileFDs, fd)
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose. Writable FDs flush
+// their handle's file; other FDs have nothing to do.
+func (fd *specialFileFD) OnClose(ctx context.Context) error {
+	if fd.vfsfd.IsWritable() {
+		return fd.handle.file.flush(ctx)
+	}
+	return nil
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead. Negative offsets fail with
+// EINVAL and any read flags fail with EOPNOTSUPP.
+func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	switch {
+	case offset < 0:
+		return 0, syserror.EINVAL
+	case opts.Flags != 0:
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Going through dst.CopyOutFrom() holds MM locks around file operations of
+	// unknown duration. For regularFileFD, doing so is necessary to support
+	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
+	// hold here since specialFileFD doesn't client-cache data. Just buffer the
+	// read instead.
+	d := fd.dentry()
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchAtime(ctx, fd.vfsfd.Mount())
+	}
+	scratch := make([]byte, dst.NumBytes())
+	nread, rerr := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(scratch)), uint64(offset))
+	if nread == 0 {
+		return 0, rerr
+	}
+	// Hand the buffered bytes to the caller; a copy-out failure takes
+	// precedence over the read error.
+	copied, cperr := dst.CopyOut(ctx, scratch[:nread])
+	if cperr != nil {
+		return int64(copied), cperr
+	}
+	return int64(nread), rerr
+}
+
+// Read implements vfs.FileDescriptionImpl.Read. It reads at the FD's offset
+// and advances the offset by the bytes read, all with fd.mu held.
+func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	nread, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += nread
+	return nread, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite. Negative offsets fail
+// with EINVAL and any write flags fail with EOPNOTSUPP.
+func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	switch {
+	case offset < 0:
+		return 0, syserror.EINVAL
+	case opts.Flags != 0:
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Do a buffered write. See rationale in PRead.
+	d := fd.dentry()
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchCMtime(ctx)
+	}
+	scratch := make([]byte, src.NumBytes())
+	// Don't do partial writes if we get a partial read from src.
+	_, cperr := src.CopyIn(ctx, scratch)
+	if cperr != nil {
+		return 0, cperr
+	}
+	nwritten, werr := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(scratch)), uint64(offset))
+	return int64(nwritten), werr
+}
+
+// Write implements vfs.FileDescriptionImpl.Write. It writes at the FD's
+// offset and advances the offset by the bytes written, all with fd.mu held.
+func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	nwritten, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += nwritten
+	return nwritten, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek. Only SEEK_SET and SEEK_CUR
+// are supported; a negative resulting offset fails with EINVAL.
+func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	if whence == linux.SEEK_CUR {
+		offset += fd.off
+	} else if whence != linux.SEEK_SET {
+		// SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not
+		// clear that file size is even meaningful for these files.
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. Writable FDs sync their
+// handle; other FDs have nothing to do.
+func (fd *specialFileFD) Sync(ctx context.Context) error {
+	if fd.vfsfd.IsWritable() {
+		return fd.handle.sync(ctx)
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
new file mode 100644
index 000000000..adf43be60
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// isSymlink reports whether d's file type is linux.S_IFLNK.
+func (d *dentry) isSymlink() bool {
+	ft := d.fileType()
+	return ft == linux.S_IFLNK
+}
+
+// readlink returns the symlink target of d. Outside InteropModeShared it
+// updates atime, serves the target from d.target when one is cached, and
+// caches a successfully read target; d.dataMu is held throughout in that case.
+//
+// Precondition: d.isSymlink().
+func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+	cacheable := d.fs.opts.interop != InteropModeShared
+	if cacheable {
+		d.touchAtime(ctx, mnt)
+		d.dataMu.Lock()
+		defer d.dataMu.Unlock()
+		if d.haveTarget {
+			return d.target, nil
+		}
+	}
+	target, err := d.file.readlink(ctx)
+	if cacheable && err == nil {
+		d.haveTarget = true
+		d.target = target
+	}
+	return target, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
new file mode 100644
index 000000000..7598ec6a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -0,0 +1,75 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func dentryTimestampFromP9(s, ns uint64) int64 {
+	return int64(s*1e9 + ns)
+}
+
+// dentryTimestampFromStatx flattens a statx timestamp into a single
+// nanosecond count: Sec*1e9 + Nsec.
+func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
+	wholeSecs := ts.Sec * 1e9
+	return wholeSecs + int64(ts.Nsec)
+}
+
+// statxTimestampFromDentry converts a dentry timestamp (a signed nanosecond
+// count) to a linux.StatxTimestamp.
+//
+// Nsec is always in [0, 1e9). For negative ns, Sec is rounded toward negative
+// infinity so that Sec*1e9 + Nsec == ns, which is what
+// dentryTimestampFromStatx computes when converting back.
+func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
+	sec, nsec := ns/1e9, ns%1e9
+	if nsec < 0 {
+		// Go's / and % truncate toward zero, so a negative ns yields a
+		// negative remainder, which would wrap when converted to uint32.
+		// Borrow one second to normalize it.
+		sec--
+		nsec += 1e9
+	}
+	return linux.StatxTimestamp{
+		Sec:  sec,
+		Nsec: uint32(nsec),
+	}
+}
+
+// nowFromContext returns the current time in nanoseconds according to the
+// realtime clock obtained from ctx. The second result is false, and the time
+// zero, if ctx provides no such clock.
+func nowFromContext(ctx context.Context) (int64, bool) {
+	clock := ktime.RealtimeClockFromContext(ctx)
+	if clock == nil {
+		return 0, false
+	}
+	return clock.Now().Nanoseconds(), true
+}
+
+// touchAtime sets d.atime to the current time. It silently does nothing if
+// mnt.CheckBeginWrite fails or if ctx provides no clock.
+//
+// Preconditions: fs.interop != InteropModeShared.
+func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	defer mnt.EndWrite()
+	if now, ok := nowFromContext(ctx); ok {
+		d.metadataMu.Lock()
+		atomic.StoreInt64(&d.atime, now)
+		d.metadataMu.Unlock()
+	}
+}
+
+// touchCMtime sets d.mtime and d.ctime to the current time. It does nothing
+// if ctx provides no clock.
+//
+// Preconditions: fs.interop != InteropModeShared. The caller has successfully
+// called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime(ctx context.Context) {
+	if now, ok := nowFromContext(ctx); ok {
+		d.metadataMu.Lock()
+		atomic.StoreInt64(&d.mtime, now)
+		atomic.StoreInt64(&d.ctime, now)
+		d.metadataMu.Unlock()
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 5ee9cf1e9..72bc15264 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -622,7 +622,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
-	if !rp.MustBeDir() {
+	if rp.MustBeDir() {
 		return syserror.ENOTDIR
 	}
 	mnt := rp.Mount()
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index bde4c7a1e..34f63986f 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -126,7 +126,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 			}
 			return uint64(n), nil
 		}
-		return readv(s.fd, iovecsFromBlockSeq(dsts))
+		return readv(s.fd, safemem.IovecsFromBlockSeq(dsts))
 	}))
 	return int64(n), err
 }
@@ -149,7 +149,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 			}
 			return uint64(n), nil
 		}
-		return writev(s.fd, iovecsFromBlockSeq(srcs))
+		return writev(s.fd, safemem.IovecsFromBlockSeq(srcs))
 	}))
 	return int64(n), err
 }
@@ -402,7 +402,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 		// We always do a non-blocking recv*().
 		sysflags := flags | syscall.MSG_DONTWAIT
 
-		iovs := iovecsFromBlockSeq(dsts)
+		iovs := safemem.IovecsFromBlockSeq(dsts)
 		msg := syscall.Msghdr{
 			Iov:    &iovs[0],
 			Iovlen: uint64(len(iovs)),
@@ -522,7 +522,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 			return uint64(n), nil
 		}
 
-		iovs := iovecsFromBlockSeq(srcs)
+		iovs := safemem.IovecsFromBlockSeq(srcs)
 		msg := syscall.Msghdr{
 			Iov:    &iovs[0],
 			Iovlen: uint64(len(iovs)),
@@ -567,21 +567,6 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 	return int(n), syserr.FromError(err)
 }
 
-func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec {
-	iovs := make([]syscall.Iovec, 0, bs.NumBlocks())
-	for ; !bs.IsEmpty(); bs = bs.Tail() {
-		b := bs.Head()
-		iovs = append(iovs, syscall.Iovec{
-			Base: &b.ToSlice()[0],
-			Len:  uint64(b.Len()),
-		})
-		// We don't need to care about b.NeedSafecopy(), because the host
-		// kernel will handle such address ranges just fine (by returning
-		// EFAULT).
-	}
-	return iovs
-}
-
 func translateIOSyscallError(err error) error {
 	if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK {
 		return syserror.ErrWouldBlock
-- 
cgit v1.2.3


From 665b614e4a6e715bac25bea15c5c29184016e549 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 4 Feb 2020 18:04:26 -0800
Subject: Support RTM_NEWADDR and RTM_GETLINK in (rt)netlink.

PiperOrigin-RevId: 293271055
---
 pkg/sentry/inet/inet.go                      |   4 +
 pkg/sentry/inet/test_stack.go                |   6 +
 pkg/sentry/socket/hostinet/stack.go          |   5 +
 pkg/sentry/socket/netlink/BUILD              |  14 +-
 pkg/sentry/socket/netlink/message.go         | 129 +++++++++++
 pkg/sentry/socket/netlink/message_test.go    | 312 +++++++++++++++++++++++++++
 pkg/sentry/socket/netlink/provider.go        |   2 +-
 pkg/sentry/socket/netlink/route/BUILD        |   2 -
 pkg/sentry/socket/netlink/route/protocol.go  | 238 ++++++++++++++------
 pkg/sentry/socket/netlink/socket.go          |  54 ++---
 pkg/sentry/socket/netlink/uevent/protocol.go |   2 +-
 pkg/sentry/socket/netstack/stack.go          |  55 +++++
 pkg/tcpip/stack/stack.go                     |   9 +
 test/syscalls/linux/BUILD                    |   2 +
 test/syscalls/linux/socket_netlink_route.cc  | 296 ++++++++++++++++++++-----
 test/syscalls/linux/socket_netlink_util.cc   |  45 +++-
 test/syscalls/linux/socket_netlink_util.h    |   9 +
 17 files changed, 1022 insertions(+), 162 deletions(-)
 create mode 100644 pkg/sentry/socket/netlink/message_test.go

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index a7dfb78a7..2916a0644 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -28,6 +28,10 @@ type Stack interface {
 	// interface indexes to a slice of associated interface address properties.
 	InterfaceAddrs() map[int32][]InterfaceAddr
 
+	// AddInterfaceAddr adds an address to the network interface identified by
+	// index.
+	AddInterfaceAddr(idx int32, addr InterfaceAddr) error
+
 	// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
 	SupportsIPv6() bool
 
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index dcfcbd97e..d8961fc94 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -47,6 +47,12 @@ func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr {
 	return s.InterfaceAddrsMap
 }
 
+// AddInterfaceAddr implements Stack.AddInterfaceAddr by appending addr to the
+// addresses recorded for idx in s.InterfaceAddrsMap. It always returns nil.
+func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error {
+	existing := s.InterfaceAddrsMap[idx]
+	s.InterfaceAddrsMap[idx] = append(existing, addr)
+	return nil
+}
+
 // SupportsIPv6 implements Stack.SupportsIPv6.
 func (s *TestStack) SupportsIPv6() bool {
 	return s.SupportsIPv6Flag
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index 034eca676..a48082631 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -310,6 +310,11 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return addrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	return syserror.EACCES
+}
+
 // SupportsIPv6 implements inet.Stack.SupportsIPv6.
 func (s *Stack) SupportsIPv6() bool {
 	return s.supportsIPv6
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f8b8e467d..1911cd9b8 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -33,3 +33,15 @@ go_library(
         "//pkg/waiter",
     ],
 )
+
+go_test(
+    name = "netlink_test",
+    size = "small",
+    srcs = [
+        "message_test.go",
+    ],
+    deps = [
+        ":netlink",
+        "//pkg/abi/linux",
+    ],
+)
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index b21e0ca4b..4ea252ccb 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -30,8 +30,16 @@ func alignUp(length int, align uint) int {
 	return (length + int(align) - 1) &^ (int(align) - 1)
 }
 
+// alignPad returns the length of padding required for alignment.
+//
+// Preconditions: align is a power of two.
+func alignPad(length int, align uint) int {
+	return alignUp(length, align) - length
+}
+
 // Message contains a complete serialized netlink message.
 type Message struct {
+	hdr linux.NetlinkMessageHeader
 	buf []byte
 }
 
@@ -40,10 +48,86 @@ type Message struct {
 // The header length will be updated by Finalize.
 func NewMessage(hdr linux.NetlinkMessageHeader) *Message {
 	return &Message{
+		hdr: hdr,
 		buf: binary.Marshal(nil, usermem.ByteOrder, hdr),
 	}
 }
 
+// ParseMessage parses the first message seen at buf, returning the rest of the
+// buffer. If the message is malformed, ok is returned as false. For the last
+// message the padding check is loose: if there isn't enough padding, the whole
+// buf is consumed and ok is set to true.
+func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) {
+	b := BytesView(buf)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+	var hdr linux.NetlinkMessageHeader
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	// Msg portion.
+	totalMsgLen := int(hdr.Length)
+	_, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+
+	// Padding.
+	numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return
+	}
+
+	return &Message{
+		hdr: hdr,
+		buf: buf[:totalMsgLen],
+	}, []byte(b), true
+}
+
+// Header returns the header of this message.
+func (m *Message) Header() linux.NetlinkMessageHeader {
+	return m.hdr
+}
+
+// GetData unmarshals the payload message header from this netlink message, and
+// returns the attributes portion.
+func (m *Message) GetData(msg interface{}) (AttrsView, bool) {
+	b := BytesView(m.buf)
+
+	_, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return nil, false
+	}
+
+	size := int(binary.Size(msg))
+	msgBytes, ok := b.Extract(size)
+	if !ok {
+		return nil, false
+	}
+	binary.Unmarshal(msgBytes, usermem.ByteOrder, msg)
+
+	numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return nil, false
+	}
+
+	return AttrsView(b), true
+}
+
 // Finalize returns the []byte containing the entire message, with the total
 // length set in the message header. The Message must not be modified after
 // calling Finalize.
@@ -157,3 +241,48 @@ func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message {
 	ms.Messages = append(ms.Messages, m)
 	return m
 }
+
+// AttrsView is a view into the attributes portion of a netlink message.
+type AttrsView []byte
+
+// Empty returns whether there is no attribute left in v.
+func (v AttrsView) Empty() bool {
+	return len(v) == 0
+}
+
+// ParseFirst parses first netlink attribute at the beginning of v.
+func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) {
+	b := BytesView(v)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+
+	_, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO))
+	if !ok {
+		return
+	}
+
+	return hdr, value, AttrsView(b), ok
+}
+
+// BytesView supports extracting data from a byte slice with bounds checking.
+type BytesView []byte
+
+// Extract removes the first n bytes from v and returns them. If n is out of
+// bounds, it returns false.
+func (v *BytesView) Extract(n int) ([]byte, bool) {
+	if n < 0 || n > len(*v) {
+		return nil, false
+	}
+	extracted := (*v)[:n]
+	*v = (*v)[n:]
+	return extracted, true
+}
diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go
new file mode 100644
index 000000000..ef13d9386
--- /dev/null
+++ b/pkg/sentry/socket/netlink/message_test.go
@@ -0,0 +1,312 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package message_test
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+)
+
+type dummyNetlinkMsg struct {
+	Foo uint16
+}
+
+func TestParseMessage(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		header  linux.NetlinkMessageHeader
+		dataMsg *dummyNetlinkMsg
+		restLen int
+		ok      bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid with next message",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+				0xFF, // Next message (rest)
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 1,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message without padding",
+			input: []byte{
+				0x12, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 18,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message not to be aligned",
+			input: []byte{
+				0x13, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+				0x00, // Excessive 1 byte permitted at end
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 19,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "header.Length too short",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header.Length too long",
+			input: []byte{
+				0xFF, 0xFF, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header incomplete",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+			},
+			ok: false,
+		},
+		{
+			desc:  "empty message",
+			input: []byte{},
+			ok:    false,
+		},
+	}
+	for _, test := range tests {
+		msg, rest, ok := netlink.ParseMessage(test.input)
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+			continue
+		}
+		if !test.ok {
+			continue
+		}
+		if !reflect.DeepEqual(msg.Header(), test.header) {
+			t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header)
+		}
+
+		dataMsg := &dummyNetlinkMsg{}
+		_, dataOk := msg.GetData(dataMsg)
+		if !dataOk {
+			t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk)
+		} else if !reflect.DeepEqual(dataMsg, test.dataMsg) {
+			t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg)
+		}
+
+		if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) {
+			t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
+
+func TestAttrView(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		// Outputs for ParseFirst.
+		hdr     linux.NetlinkAttrHeader
+		value   []byte
+		restLen int
+		ok      bool
+
+		// Outputs for Empty.
+		isEmpty bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x06, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 6,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment with rest data",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+				0xFF, 0xFE, // Rest data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 2,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too long",
+			input: []byte{
+				0xFF, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too short",
+			input: []byte{
+				0x01, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc:    "empty",
+			input:   []byte{},
+			ok:      false,
+			isEmpty: true,
+		},
+	}
+	for _, test := range tests {
+		attrs := netlink.AttrsView(test.input)
+
+		// Test ParseFirst().
+		hdr, value, rest, ok := attrs.ParseFirst()
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+		} else if test.ok {
+			if !reflect.DeepEqual(hdr, test.hdr) {
+				t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr)
+			}
+			if !bytes.Equal(value, test.value) {
+				t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value)
+			}
+			if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) {
+				t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest)
+			}
+		}
+
+		// Test Empty().
+		if got, want := attrs.Empty(), test.isEmpty; got != want {
+			t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 07f860a49..b0dc70e5c 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -42,7 +42,7 @@ type Protocol interface {
 	// If err == nil, any messages added to ms will be sent back to the
 	// other end of the socket. Setting ms.Multi will cause an NLMSG_DONE
 	// message to be sent even if ms contains no messages.
-	ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error
+	ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error
 }
 
 // Provider is a function that creates a new Protocol for a specific netlink
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 622a1eafc..93127398d 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -10,13 +10,11 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
-        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 2b3c7f5b3..c84d8bd7c 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -17,16 +17,15 @@ package route
 
 import (
 	"bytes"
+	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // commandKind describes the operational class of a message type.
@@ -69,13 +68,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // dumpLinks handles RTM_GETLINK dump requests.
-func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
-	}
-
+func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
 	// ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some
 	// userspace applications (including glibc) still include rtgenmsg.
@@ -99,44 +92,105 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 		return nil
 	}
 
-	for id, i := range stack.Interfaces() {
-		m := ms.AddMessage(linux.NetlinkMessageHeader{
-			Type: linux.RTM_NEWLINK,
-		})
+	for idx, i := range stack.Interfaces() {
+		addNewLinkMessage(ms, idx, i)
+	}
 
-		m.Put(linux.InterfaceInfoMessage{
-			Family: linux.AF_UNSPEC,
-			Type:   i.DeviceType,
-			Index:  id,
-			Flags:  i.Flags,
-		})
+	return nil
+}
 
-		m.PutAttrString(linux.IFLA_IFNAME, i.Name)
-		m.PutAttr(linux.IFLA_MTU, i.MTU)
+// getLink handles RTM_GETLINK requests.
+func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network devices.
+		return nil
+	}
 
-		mac := make([]byte, 6)
-		brd := mac
-		if len(i.Addr) > 0 {
-			mac = i.Addr
-			brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
+	// Parse message.
+	var ifi linux.InterfaceInfoMessage
+	attrs, ok := msg.GetData(&ifi)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	// Parse attributes.
+	var byName []byte
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
 		}
-		m.PutAttr(linux.IFLA_ADDRESS, mac)
-		m.PutAttr(linux.IFLA_BROADCAST, brd)
+		attrs = rest
 
-		// TODO(gvisor.dev/issue/578): There are many more attributes.
+		switch ahdr.Type {
+		case linux.IFLA_IFNAME:
+			if len(value) < 1 {
+				return syserr.ErrInvalidArgument
+			}
+			byName = value[:len(value)-1]
+
+			// TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK.
+		}
 	}
 
+	found := false
+	for idx, i := range stack.Interfaces() {
+		switch {
+		case ifi.Index > 0:
+			if idx != ifi.Index {
+				continue
+			}
+		case byName != nil:
+			if string(byName) != i.Name {
+				continue
+			}
+		default:
+			// Criteria not specified.
+			return syserr.ErrInvalidArgument
+		}
+
+		addNewLinkMessage(ms, idx, i)
+		found = true
+		break
+	}
+	if !found {
+		return syserr.ErrNoDevice
+	}
 	return nil
 }
 
-// dumpAddrs handles RTM_GETADDR dump requests.
-func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
+// addNewLinkMessage appends RTM_NEWLINK message for the given interface into
+// the message set.
+func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.RTM_NEWLINK,
+	})
+
+	m.Put(linux.InterfaceInfoMessage{
+		Family: linux.AF_UNSPEC,
+		Type:   i.DeviceType,
+		Index:  idx,
+		Flags:  i.Flags,
+	})
+
+	m.PutAttrString(linux.IFLA_IFNAME, i.Name)
+	m.PutAttr(linux.IFLA_MTU, i.MTU)
+
+	mac := make([]byte, 6)
+	brd := mac
+	if len(i.Addr) > 0 {
+		mac = i.Addr
+		brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
 	}
+	m.PutAttr(linux.IFLA_ADDRESS, mac)
+	m.PutAttr(linux.IFLA_BROADCAST, brd)
+
+	// TODO(gvisor.dev/issue/578): There are many more attributes.
+}
 
+// dumpAddrs handles RTM_GETADDR dump requests.
+func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETADDR dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -168,6 +222,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 				Index:     uint32(id),
 			})
 
+			m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr))
 			m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
 
 			// TODO(gvisor.dev/issue/578): There are many more attributes.
@@ -252,12 +307,12 @@ func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) {
 }
 
 // parseForDestination parses a message as format of RouteMessage-RtAttr-dst.
-func parseForDestination(data []byte) ([]byte, *syserr.Error) {
+func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) {
 	var rtMsg linux.RouteMessage
-	if len(data) < linux.SizeOfRouteMessage {
+	attrs, ok := msg.GetData(&rtMsg)
+	if !ok {
 		return nil, syserr.ErrInvalidArgument
 	}
-	binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg)
 	// iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See
 	// commit bc234301af12. Note we don't check this flag for backward
 	// compatibility.
@@ -265,26 +320,15 @@ func parseForDestination(data []byte) ([]byte, *syserr.Error) {
 		return nil, syserr.ErrNotSupported
 	}
 
-	data = data[linux.SizeOfRouteMessage:]
-
-	// TODO(gvisor.dev/issue/1611): Add generic attribute parsing.
-	var rtAttr linux.RtAttr
-	if len(data) < linux.SizeOfRtAttr {
-		return nil, syserr.ErrInvalidArgument
+	// Expect first attribute is RTA_DST.
+	if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST {
+		return value, nil
 	}
-	binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr)
-	if rtAttr.Type != linux.RTA_DST {
-		return nil, syserr.ErrInvalidArgument
-	}
-
-	if len(data) < int(rtAttr.Len) {
-		return nil, syserr.ErrInvalidArgument
-	}
-	return data[linux.SizeOfRtAttr:rtAttr.Len], nil
+	return nil, syserr.ErrInvalidArgument
 }
 
 // dumpRoutes handles RTM_GETROUTE requests.
-func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETROUTE dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -295,10 +339,11 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 		return nil
 	}
 
+	hdr := msg.Header()
 	routeTables := stack.RouteTable()
 
 	if hdr.Flags == linux.NLM_F_REQUEST {
-		dst, err := parseForDestination(data)
+		dst, err := parseForDestination(msg)
 		if err != nil {
 			return err
 		}
@@ -357,10 +402,55 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 	return nil
 }
 
+// newAddr handles RTM_NEWADDR requests.
+func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network stack.
+		return syserr.ErrProtocolNotSupported
+	}
+
+	var ifa linux.InterfaceAddrMessage
+	attrs, ok := msg.GetData(&ifa)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
+		}
+		attrs = rest
+
+		switch ahdr.Type {
+		case linux.IFA_LOCAL:
+			err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
+				Family:    ifa.Family,
+				PrefixLen: ifa.PrefixLen,
+				Flags:     ifa.Flags,
+				Addr:      value,
+			})
+			if err == syscall.EEXIST {
+				flags := msg.Header().Flags
+				if flags&linux.NLM_F_EXCL != 0 {
+					return syserr.ErrExists
+				}
+			} else if err != nil {
+				return syserr.ErrInvalidArgument
+			}
+		}
+	}
+	return nil
+}
+
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	hdr := msg.Header()
+
 	// All messages start with a 1 byte protocol family.
-	if len(data) < 1 {
+	var family uint8
+	if _, ok := msg.GetData(&family); !ok {
 		// Linux ignores messages missing the protocol family. See
 		// net/core/rtnetlink.c:rtnetlink_rcv_msg.
 		return nil
@@ -374,16 +464,32 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		}
 	}
 
-	switch hdr.Type {
-	case linux.RTM_GETLINK:
-		return p.dumpLinks(ctx, hdr, data, ms)
-	case linux.RTM_GETADDR:
-		return p.dumpAddrs(ctx, hdr, data, ms)
-	case linux.RTM_GETROUTE:
-		return p.dumpRoutes(ctx, hdr, data, ms)
-	default:
-		return syserr.ErrNotSupported
+	if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP {
+		// TODO(b/68878065): Only the dump variants of the types below are
+		// supported.
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.dumpLinks(ctx, msg, ms)
+		case linux.RTM_GETADDR:
+			return p.dumpAddrs(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
+	} else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST {
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.getLink(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		case linux.RTM_NEWADDR:
+			return p.newAddr(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
 	}
+	return syserr.ErrNotSupported
 }
 
 // init registers the NETLINK_ROUTE provider.
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index c4b95debb..2ca02567d 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -644,47 +644,38 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 	return nil
 }
 
-func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error {
+func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) {
 	m := ms.AddMessage(linux.NetlinkMessageHeader{
 		Type: linux.NLMSG_ERROR,
 	})
-
 	m.Put(linux.NetlinkErrorMessage{
 		Error:  int32(-err.ToLinux().Number()),
 		Header: hdr,
 	})
-	return nil
+}
 
+func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.NLMSG_ERROR,
+	})
+	m.Put(linux.NetlinkErrorMessage{
+		Error:  0,
+		Header: hdr,
+	})
 }
 
 // processMessages handles each message in buf, passing it to the protocol
 // handler for final handling.
 func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
 	for len(buf) > 0 {
-		if len(buf) < linux.NetlinkMessageHeaderSize {
+		msg, rest, ok := ParseMessage(buf)
+		if !ok {
 			// Linux ignores messages that are too short. See
 			// net/netlink/af_netlink.c:netlink_rcv_skb.
 			break
 		}
-
-		var hdr linux.NetlinkMessageHeader
-		binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr)
-
-		if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) {
-			// Linux ignores malformed messages. See
-			// net/netlink/af_netlink.c:netlink_rcv_skb.
-			break
-		}
-
-		// Data from this message.
-		data := buf[linux.NetlinkMessageHeaderSize:hdr.Length]
-
-		// Advance to the next message.
-		next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO)
-		if next >= len(buf)-1 {
-			next = len(buf) - 1
-		}
-		buf = buf[next:]
+		buf = rest
+		hdr := msg.Header()
 
 		// Ignore control messages.
 		if hdr.Type < linux.NLMSG_MIN_TYPE {
@@ -692,19 +683,10 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
 		}
 
 		ms := NewMessageSet(s.portID, hdr.Seq)
-		var err *syserr.Error
-		// TODO(b/68877377): ACKs not supported yet.
-		if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
-			err = syserr.ErrNotSupported
-		} else {
-
-			err = s.protocol.ProcessMessage(ctx, hdr, data, ms)
-		}
-		if err != nil {
-			ms = NewMessageSet(s.portID, hdr.Seq)
-			if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil {
-				return err
-			}
+		if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil {
+			dumpErrorMesage(hdr, ms, err)
+		} else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
+			dumpAckMesage(hdr, ms)
 		}
 
 		if err := s.sendResponse(ctx, ms); err != nil {
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
index 1ee4296bc..029ba21b5 100644
--- a/pkg/sentry/socket/netlink/uevent/protocol.go
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -49,7 +49,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// Silently ignore all messages.
 	return nil
 }
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 31ea66eca..0692482e9 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -20,6 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -88,6 +90,59 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return nicAddrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	var (
+		protocol tcpip.NetworkProtocolNumber
+		address  tcpip.Address
+	)
+	switch addr.Family {
+	case linux.AF_INET:
+		if len(addr.Addr) < header.IPv4AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv4AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv4.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv4AddressSize])
+
+	case linux.AF_INET6:
+		if len(addr.Addr) < header.IPv6AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv6AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv6.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv6AddressSize])
+
+	default:
+		return syserror.ENOTSUP
+	}
+
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol: protocol,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   address,
+			PrefixLen: int(addr.PrefixLen),
+		},
+	}
+
+	// Attach address to interface.
+	if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+		return syserr.TranslateNetstackError(err).ToError()
+	}
+
+	// Add route for local network.
+	s.Stack.AddRoute(tcpip.Route{
+		Destination: protocolAddress.AddressWithPrefix.Subnet(),
+		Gateway:     "", // No gateway for local network.
+		NIC:         tcpip.NICID(idx),
+	})
+	return nil
+}
+
 // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
 func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
 	var rs tcp.ReceiveBufferSizeOption
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7057b110e..b793f1d74 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -795,6 +795,8 @@ func (s *Stack) Forwarding() bool {
 
 // SetRouteTable assigns the route table to be used by this stack. It
 // specifies which NIC to use for given destination address ranges.
+//
+// This method takes ownership of the table.
 func (s *Stack) SetRouteTable(table []tcpip.Route) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -809,6 +811,13 @@ func (s *Stack) GetRouteTable() []tcpip.Route {
 	return append([]tcpip.Route(nil), s.routeTable...)
 }
 
+// AddRoute appends a route to the route table.
+func (s *Stack) AddRoute(route tcpip.Route) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.routeTable = append(s.routeTable, route)
+}
+
 // NewEndpoint creates a new transport layer endpoint of the given protocol.
 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	t, ok := s.transportProtocols[transport]
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 273b014d6..f2e3c7072 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2769,9 +2769,11 @@ cc_binary(
     deps = [
         ":socket_netlink_util",
         ":socket_test_util",
+        "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 1e28e658d..e5aed1eec 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -14,6 +14,7 @@
 
 #include <arpa/inet.h>
 #include <ifaddrs.h>
+#include <linux/if.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
@@ -25,8 +26,10 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
@@ -38,6 +41,8 @@ namespace testing {
 
 namespace {
 
+constexpr uint32_t kSeq = 12345;
+
 using ::testing::AnyOf;
 using ::testing::Eq;
 
@@ -113,58 +118,224 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
+  // Loopback is common among all tests, check that it's found.
+  bool loopbackFound = false;
+  ASSERT_NO_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    CheckGetLinkResponse(hdr, kSeq, port);
+    if (hdr->nlmsg_type != RTM_NEWLINK) {
+      return;
+    }
+    ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    std::cout << "Found interface idx=" << msg->ifi_index
+              << ", type=" << std::hex << msg->ifi_type;
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      loopbackFound = true;
+      EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
+    }
+  }));
+  EXPECT_TRUE(loopbackFound);
+}
+
+struct Link {
+  int index;
+  std::string name;
+};
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  absl::optional<Link> link;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+      if (rta == nullptr) {
+        // Ignore links that do not have a name.
+        return;
+      }
+
+      link = Link();
+      link->index = msg->ifi_index;
+      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    }
+  }));
+  return link;
+}
+
+// CheckLinkMsg checks a netlink message against an expected link.
+void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
+  ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
+  ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+  const struct ifinfomsg* msg =
+      reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+  EXPECT_EQ(msg->ifi_index, link.index);
+
+  const struct rtattr* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+  EXPECT_NE(nullptr, rta) << "IFLA_IFNAME not found in message.";
+  if (rta != nullptr) {
+    std::string name(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    EXPECT_EQ(name, link.name);
+  }
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndex) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = loopback_link->index;
 
-  // Loopback is common among all tests, check that it's found.
-  bool loopbackFound = false;
+  bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckGetLinkResponse(hdr, kSeq, port);
-        if (hdr->nlmsg_type != RTM_NEWLINK) {
-          return;
-        }
-        ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
-        const struct ifinfomsg* msg =
-            reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-        std::cout << "Found interface idx=" << msg->ifi_index
-                  << ", type=" << std::hex << msg->ifi_type;
-        if (msg->ifi_type == ARPHRD_LOOPBACK) {
-          loopbackFound = true;
-          EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
-        }
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
       },
       false));
-  EXPECT_TRUE(loopbackFound);
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
 }
 
-TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+TEST(NetlinkRouteTest, GetLinkByName) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
   };
 
-  constexpr uint32_t kSeq = 12345;
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
+  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  bool found = false;
+  ASSERT_NO_ERRNO(NetlinkRequestResponse(
+      fd, &req, sizeof(req),
+      [&](const struct nlmsghdr* hdr) {
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
+      },
+      false));
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndexNotFound) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = 1234590;
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, GetLinkByNameNotFound) {
+  const std::string name = "nodevice?!";
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(name.size() + 1);
+  strncpy(req.ifname, name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -175,18 +346,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
 
-  ASSERT_NO_ERRNO(NetlinkRequestResponse(
-      fd, &req, sizeof(req),
-      [&](const struct nlmsghdr* hdr) {
-        EXPECT_THAT(hdr->nlmsg_type, Eq(NLMSG_ERROR));
-        EXPECT_EQ(hdr->nlmsg_seq, kSeq);
-        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
-
-        const struct nlmsgerr* msg =
-            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
-        EXPECT_EQ(msg->error, -EOPNOTSUPP);
-      },
-      true));
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(EOPNOTSUPP, ::testing::_));
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
@@ -198,8 +359,6 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -238,8 +397,6 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -282,8 +439,6 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
 
   // This control message is ignored. We still receive a response for the
@@ -317,8 +472,6 @@ TEST(NetlinkRouteTest, GetAddrDump) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -367,6 +520,57 @@ TEST(NetlinkRouteTest, LookupAll) {
   ASSERT_GT(count, 0);
 }
 
+TEST(NetlinkRouteTest, AddAddr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifa;
+    struct rtattr rtattr;
+    struct in_addr addr;
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifa.ifa_family = AF_INET;
+  req.ifa.ifa_prefixlen = 24;
+  req.ifa.ifa_flags = 0;
+  req.ifa.ifa_scope = 0;
+  req.ifa.ifa_index = loopback_link->index;
+  req.rtattr.rta_type = IFA_LOCAL;
+  req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
+  inet_pton(AF_INET, "10.0.0.1", &req.addr);
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  // Create should succeed, as no such address exists in the kernel.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Replacing an existing address should succeed.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Create exclusive should fail, as we created the address above.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_THAT(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len),
+      PosixErrorIs(EEXIST, ::testing::_));
+}
+
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
 TEST(NetlinkRouteTest, GetRouteDump) {
   FileDescriptor fd =
@@ -378,8 +582,6 @@ TEST(NetlinkRouteTest, GetRouteDump) {
     struct rtmsg rtm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETROUTE;
@@ -538,8 +740,6 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -615,8 +815,6 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -695,8 +893,6 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -743,8 +939,6 @@ TEST(NetlinkRouteTest, PasscredCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index cd2212a1a..952eecfe8 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -16,6 +16,7 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 #include <sys/socket.h>
 
 #include <vector>
@@ -71,9 +72,10 @@ PosixError NetlinkRequestResponse(
   iov.iov_base = buf.data();
   iov.iov_len = buf.size();
 
-  // Response is a series of NLM_F_MULTI messages, ending with a NLMSG_DONE
-  // message.
+  // If NLM_F_MULTI is set, response is a series of messages that ends with a
+  // NLMSG_DONE message.
   int type = -1;
+  int flags = 0;
   do {
     int len;
     RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
@@ -89,6 +91,7 @@ PosixError NetlinkRequestResponse(
     for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
          NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
       fn(hdr);
+      flags = hdr->nlmsg_flags;
       type = hdr->nlmsg_type;
       // Done should include an integer payload for dump_done_errno.
       // See net/netlink/af_netlink.c:netlink_dump
@@ -98,11 +101,11 @@ PosixError NetlinkRequestResponse(
         EXPECT_GE(hdr->nlmsg_len, NLMSG_LENGTH(sizeof(int)));
       }
     }
-  } while (type != NLMSG_DONE && type != NLMSG_ERROR);
+  } while ((flags & NLM_F_MULTI) && type != NLMSG_DONE && type != NLMSG_ERROR);
 
   if (expect_nlmsgerr) {
     EXPECT_EQ(type, NLMSG_ERROR);
-  } else {
+  } else if (flags & NLM_F_MULTI) {
     EXPECT_EQ(type, NLMSG_DONE);
   }
   return NoError();
@@ -146,5 +149,39 @@ PosixError NetlinkRequestResponseSingle(
   return NoError();
 }
 
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len) {
+  // Sentinel meaning "no NLMSG_ERROR message was received". Since err is set
+  // to -msg->error below, a real result is never expected to be negative.
+  int err = -42;
+  RETURN_IF_ERRNO(NetlinkRequestResponse(
+      fd, request, len,
+      [&](const struct nlmsghdr* hdr) {
+        EXPECT_EQ(NLMSG_ERROR, hdr->nlmsg_type);
+        EXPECT_EQ(hdr->nlmsg_seq, seq);
+        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
+
+        const struct nlmsgerr* msg =
+            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
+        err = -msg->error;
+      },
+      true));
+  return PosixError(err);
+}
+
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr) {
+  const int ifi_space = NLMSG_SPACE(sizeof(*msg));
+  int attrlen = hdr->nlmsg_len - ifi_space;
+  const struct rtattr* rta = reinterpret_cast<const struct rtattr*>(
+      reinterpret_cast<const uint8_t*>(hdr) + NLMSG_ALIGN(ifi_space));
+  for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
+    if (rta->rta_type == attr) {
+      return rta;
+    }
+  }
+  return nullptr;
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index 3678c0599..e13ead406 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -19,6 +19,7 @@
 // socket.h has to be included before if_arp.h.
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -47,6 +48,14 @@ PosixError NetlinkRequestResponseSingle(
     const FileDescriptor& fd, void* request, size_t len,
     const std::function<void(const struct nlmsghdr* hdr)>& fn);
 
+// Send the passed request then expect and return an ack or error.
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len);
+
+// Find rtnetlink attribute in message.
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From f3d95607036b8a502c65aa7b3e8145227274dbbc Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 5 Feb 2020 17:56:00 -0800
Subject: recv() on a closed TCP socket returns ENOTCONN

From RFC 793 s3.9 p58 Event Processing:

If RECEIVE Call arrives in CLOSED state and the user has access to such a
connection, the return should be "error: connection does not exist"

Fixes #1598

PiperOrigin-RevId: 293494287
---
 pkg/sentry/socket/netstack/netstack.go | 7 ++++++-
 pkg/tcpip/tcpip.go                     | 4 ++++
 pkg/tcpip/transport/tcp/endpoint.go    | 4 ++--
 pkg/tcpip/transport/tcp/tcp_test.go    | 9 ++++-----
 test/syscalls/linux/tcp_socket.cc      | 9 +++++++++
 5 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 049d04bf2..ed2fbcceb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2229,11 +2229,16 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 	var copied int
 
 	// Copy as many views as possible into the user-provided buffer.
-	for dst.NumBytes() != 0 {
+	for {
+		// Always do at least one fetchReadView, even if the number of bytes to
+		// read is 0.
 		err = s.fetchReadView()
 		if err != nil {
 			break
 		}
+		if dst.NumBytes() == 0 {
+			break
+		}
 
 		var n int
 		var e error
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0fa141d58..d29d9a704 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1124,6 +1124,10 @@ type ReadErrors struct {
 	// InvalidEndpointState is the number of times we found the endpoint state
 	// to be unexpected.
 	InvalidEndpointState StatCounter
+
+	// NotConnected is the number of times we tried to read but found that the
+	// endpoint was not connected.
+	NotConnected StatCounter
 }
 
 // WriteErrors collects packet write errors from an endpoint write call.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b5a8e15ee..e4a6b1b8b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1003,8 +1003,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
-		e.stats.ReadErrors.InvalidEndpointState.Increment()
-		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
+		e.stats.ReadErrors.NotConnected.Increment()
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
 	}
 
 	v, err := e.readLocked()
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2c1505067..cc118c993 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5405,12 +5405,11 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
-	// Expect InvalidEndpointState errors on a read at this point.
-	if _, _, err := ep.Read(nil); err != tcpip.ErrInvalidEndpointState {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrInvalidEndpointState)
+	if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected {
+		t.Errorf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrNotConnected)
 	}
-	if got := ep.Stats().(*tcp.Stats).ReadErrors.InvalidEndpointState.Value(); got != 1 {
-		t.Fatalf("got EP stats Stats.ReadErrors.InvalidEndpointState got %v want %v", got, 1)
+	if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 {
+		t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %v want %v", got, 1)
 	}
 
 	if err := ep.Listen(10); err != nil {
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 525ccbd88..8a8b68e75 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1339,6 +1339,15 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
   EXPECT_EQ(get, kTCPDeferAccept);
 }
 
+TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
+  auto s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  char buf[1];
+  EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallFailsWithErrno(ENOTCONN));
+  EXPECT_THAT(recv(s.get(), buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 1b6a12a768216a99a5e0428c42ea4faf79cf3b50 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 5 Feb 2020 22:45:44 -0800
Subject: Add notes to relevant tests.

These were out-of-band notes that can help provide additional context
and simplify automated imports.

PiperOrigin-RevId: 293525915
---
 pkg/metric/metric.go                          |  1 -
 pkg/sentry/arch/arch_x86.go                   |  4 ++
 pkg/sentry/arch/signal_amd64.go               |  2 +-
 pkg/sentry/fs/file_overlay_test.go            |  1 +
 pkg/sentry/fs/proc/README.md                  |  4 ++
 pkg/sentry/kernel/BUILD                       |  1 +
 pkg/sentry/kernel/kernel.go                   |  3 ++
 pkg/sentry/kernel/kernel_opts.go              | 20 +++++++
 pkg/sentry/socket/hostinet/BUILD              |  1 +
 pkg/sentry/socket/hostinet/socket.go          |  5 +-
 pkg/sentry/socket/hostinet/sockopt_impl.go    | 27 ++++++++++
 pkg/tcpip/transport/tcp/endpoint.go           |  3 ++
 runsc/boot/filter/BUILD                       |  1 +
 runsc/boot/filter/config.go                   | 13 -----
 runsc/boot/filter/config_profile.go           | 34 ++++++++++++
 runsc/container/console_test.go               |  5 +-
 runsc/dockerutil/dockerutil.go                | 11 ++--
 runsc/testutil/BUILD                          |  5 +-
 runsc/testutil/testutil.go                    | 54 -------------------
 runsc/testutil/testutil_runfiles.go           | 75 +++++++++++++++++++++++++++
 test/image/image_test.go                      |  8 +--
 test/syscalls/build_defs.bzl                  | 35 +++++++++++--
 test/syscalls/linux/chroot.cc                 |  2 +-
 test/syscalls/linux/concurrency.cc            |  3 +-
 test/syscalls/linux/exec_proc_exe_workload.cc |  6 +++
 test/syscalls/linux/fork.cc                   |  5 +-
 test/syscalls/linux/mmap.cc                   |  8 +--
 test/syscalls/linux/open_create.cc            |  1 +
 test/syscalls/linux/preadv.cc                 |  1 +
 test/syscalls/linux/proc.cc                   | 46 +++++++++++++---
 test/syscalls/linux/readv.cc                  |  4 +-
 test/syscalls/linux/rseq.cc                   |  2 +-
 test/syscalls/linux/select.cc                 |  2 +-
 test/syscalls/linux/shm.cc                    |  2 +-
 test/syscalls/linux/sigprocmask.cc            |  2 +-
 test/syscalls/linux/socket_unix_non_stream.cc |  4 +-
 test/syscalls/linux/symlink.cc                |  2 +-
 test/syscalls/linux/tcp_socket.cc             |  3 +-
 test/syscalls/linux/time.cc                   |  1 +
 test/syscalls/linux/tkill.cc                  |  2 +-
 test/util/temp_path.cc                        |  1 +
 tools/build/tags.bzl                          |  4 ++
 tools/defs.bzl                                | 17 +++++-
 43 files changed, 318 insertions(+), 113 deletions(-)
 create mode 100644 pkg/sentry/kernel/kernel_opts.go
 create mode 100644 pkg/sentry/socket/hostinet/sockopt_impl.go
 create mode 100644 runsc/boot/filter/config_profile.go
 create mode 100644 runsc/testutil/testutil_runfiles.go

(limited to 'pkg/sentry/socket')

diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 93d4f2b8c..006fcd9ab 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -46,7 +46,6 @@ var (
 //
 // TODO(b/67298402): Support non-cumulative metrics.
 // TODO(b/67298427): Support metric fields.
-//
 type Uint64Metric struct {
 	// value is the actual value of the metric. It must be accessed
 	// atomically.
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index a18093155..3db8bd34b 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte {
 	size, align := cpuid.HostFeatureSet().ExtendedStateSize()
 	capacity := size
 	// Always use at least 4096 bytes.
+	//
+	// For the KVM platform, this state is a fixed 4096 bytes, so make sure
+	// that the underlying array is at _least_ that size otherwise we will
+	// corrupt random memory. This is not a pleasant thing to debug.
 	if capacity < 4096 {
 		capacity = 4096
 	}
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 81b92bb43..6fb756f0e 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -55,7 +55,7 @@ type SignalContext64 struct {
 	Trapno  uint64
 	Oldmask linux.SignalSet
 	Cr2     uint64
-	// Pointer to a struct _fpstate.
+	// Pointer to a struct _fpstate. See b/33003106#comment8.
 	Fpstate  uint64
 	Reserved [8]uint64
 }
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index 02538bb4f..a76d87e3a 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) {
 
 // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
 // a frozen dirent tree does not make Readdir calls to the underlying files.
+// This is a regression test for b/114808269.
 func TestReaddirOverlayFrozen(t *testing.T) {
 	ctx := contexttest.Context(t)
 
diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md
index 5d4ec6c7b..6667a0916 100644
--- a/pkg/sentry/fs/proc/README.md
+++ b/pkg/sentry/fs/proc/README.md
@@ -11,6 +11,8 @@ inconsistency, please file a bug.
 
 The following files are implemented:
 
+<!-- mdformat off(don't wrap the table) -->
+
 | File /proc/                 | Content                                               |
 | :------------------------   | :---------------------------------------------------- |
 | [cpuinfo](#cpuinfo)         | Info about the CPU                                    |
@@ -22,6 +24,8 @@ The following files are implemented:
 | [uptime](#uptime)           | Wall clock since boot, combined idle time of all cpus |
 | [version](#version)         | Kernel version                                        |
 
+<!-- mdformat on -->
+
 ### cpuinfo
 
 ```bash
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a27628c0a..2231d6973 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -91,6 +91,7 @@ go_library(
         "fs_context.go",
         "ipc_namespace.go",
         "kernel.go",
+        "kernel_opts.go",
         "kernel_state.go",
         "pending_signals.go",
         "pending_signals_list.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index dcd6e91c4..3ee760ba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -235,6 +235,9 @@ type Kernel struct {
 	// events. This is initialized lazily on the first unimplemented
 	// syscall.
 	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
+
+	// SpecialOpts contains special kernel options.
+	SpecialOpts
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go
new file mode 100644
index 000000000..2e66ec587
--- /dev/null
+++ b/pkg/sentry/kernel/kernel_opts.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// SpecialOpts contains non-standard options for the kernel.
+//
+// +stateify savable
+type SpecialOpts struct{}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 5a07d5d0e..023bad156 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -10,6 +10,7 @@ go_library(
         "save_restore.go",
         "socket.go",
         "socket_unsafe.go",
+        "sockopt_impl.go",
         "stack.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 34f63986f..de76388ac 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -285,7 +285,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	}
 
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := getSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -330,7 +330,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 // SetSockOpt implements socket.Socket.SetSockOpt.
 func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := setSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -353,6 +353,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 			optlen = sizeofInt32
 		}
 	}
+
 	if optlen == 0 {
 		// Pretend to accept socket options we don't understand. This seems
 		// dangerous, but it's what netstack does...
diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go
new file mode 100644
index 000000000..8a783712e
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/sockopt_impl.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostinet
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+func getSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options; GetSockOpt falls back to its built-in whitelist.
+}
+
+func setSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options; SetSockOpt falls back to its built-in whitelist.
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e4a6b1b8b..f2be0e651 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2166,6 +2166,9 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	e.isRegistered = true
 	e.setEndpointState(StateListen)
 
+	// The channel may be non-nil when we're restoring the endpoint, and it
+	// may be pre-populated with some previously accepted (but not Accepted)
+	// endpoints.
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index ce30f6c53..ed18f0047 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -8,6 +8,7 @@ go_library(
         "config.go",
         "config_amd64.go",
         "config_arm64.go",
+        "config_profile.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index f8d351c7b..c69f4c602 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -536,16 +536,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 		},
 	}
 }
-
-// profileFilters returns extra syscalls made by runtime/pprof package.
-func profileFilters() seccomp.SyscallRules {
-	return seccomp.SyscallRules{
-		syscall.SYS_OPENAT: []seccomp.Rule{
-			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
-			},
-		},
-	}
-}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns seccomp rules for the extra syscalls made by the runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+			},
+		},
+	}
+}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 060b63bf3..c2518d52b 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -196,7 +196,10 @@ func TestJobControlSignalExec(t *testing.T) {
 	defer ptyMaster.Close()
 	defer ptySlave.Close()
 
-	// Exec bash and attach a terminal.
+	// Exec bash and attach a terminal. Note that occasionally /bin/sh
+	// may be a different shell or have a different configuration (such
+	// as disabling interactive mode and job control). Since we want to
+	// explicitly test interactive mode, use /bin/bash. See b/116981926.
 	execArgs := &control.ExecArgs{
 		Filename: "/bin/bash",
 		// Don't let bash execute from profile or rc files, otherwise
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 9b6346ca2..1ff5e8cc3 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -143,8 +143,11 @@ func PrepareFiles(names ...string) (string, error) {
 		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
 	}
 	for _, name := range names {
-		src := getLocalPath(name)
-		dst := path.Join(dir, name)
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err)
+		}
+		dst := path.Join(dir, path.Base(name))
 		if err := testutil.Copy(src, dst); err != nil {
 			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
 		}
@@ -152,10 +155,6 @@ func PrepareFiles(names ...string) (string, error) {
 	return dir, nil
 }
 
-func getLocalPath(file string) string {
-	return path.Join(".", file)
-}
-
 // do executes docker command.
 func do(args ...string) (string, error) {
 	log.Printf("Running: docker %s\n", args)
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index f845120b0..945405303 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "testutil",
     testonly = 1,
-    srcs = ["testutil.go"],
+    srcs = [
+        "testutil.go",
+        "testutil_runfiles.go",
+    ],
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index edf2e809a..80c2c9680 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -79,60 +79,6 @@ func ConfigureExePath() error {
 	return nil
 }
 
-// FindFile searchs for a file inside the test run environment. It returns the
-// full path to the file. It fails if none or more than one file is found.
-func FindFile(path string) (string, error) {
-	wd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	// The test root is demarcated by a path element called "__main__". Search for
-	// it backwards from the working directory.
-	root := wd
-	for {
-		dir, name := filepath.Split(root)
-		if name == "__main__" {
-			break
-		}
-		if len(dir) == 0 {
-			return "", fmt.Errorf("directory __main__ not found in %q", wd)
-		}
-		// Remove ending slash to loop around.
-		root = dir[:len(dir)-1]
-	}
-
-	// Annoyingly, bazel adds the build type to the directory path for go
-	// binaries, but not for c++ binaries. We use two different patterns to
-	// to find our file.
-	patterns := []string{
-		// Try the obvious path first.
-		filepath.Join(root, path),
-		// If it was a go binary, use a wildcard to match the build
-		// type. The pattern is: /test-path/__main__/directories/*/file.
-		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
-	}
-
-	for _, p := range patterns {
-		matches, err := filepath.Glob(p)
-		if err != nil {
-			// "The only possible returned error is ErrBadPattern,
-			// when pattern is malformed." -godoc
-			return "", fmt.Errorf("error globbing %q: %v", p, err)
-		}
-		switch len(matches) {
-		case 0:
-			// Try the next pattern.
-		case 1:
-			// We found it.
-			return matches[0], nil
-		default:
-			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
-		}
-	}
-	return "", fmt.Errorf("file %q not found", path)
-}
-
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
 func TestConfig() *boot.Config {
diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go
new file mode 100644
index 000000000..ece9ea9a1
--- /dev/null
+++ b/runsc/testutil/testutil_runfiles.go
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// FindFile searches for a file inside the test run environment. It returns the
+// full path to the file. It fails if none or more than one file is found.
+func FindFile(path string) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	// The test root is demarcated by a path element called "__main__". Search for
+	// it backwards from the working directory.
+	root := wd
+	for {
+		dir, name := filepath.Split(root)
+		if name == "__main__" {
+			break
+		}
+		if len(dir) == 0 {
+			return "", fmt.Errorf("directory __main__ not found in %q", wd)
+		}
+		// Remove ending slash to loop around.
+		root = dir[:len(dir)-1]
+	}
+
+	// Annoyingly, bazel adds the build type to the directory path for go
+	// binaries, but not for c++ binaries. We use two different patterns
+	// to find our file.
+	patterns := []string{
+		// Try the obvious path first.
+		filepath.Join(root, path),
+		// If it was a go binary, use a wildcard to match the build
+		// type. The pattern is: /test-path/__main__/directories/*/file.
+		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
+	}
+
+	for _, p := range patterns {
+		matches, err := filepath.Glob(p)
+		if err != nil {
+			// "The only possible returned error is ErrBadPattern,
+			// when pattern is malformed." -godoc
+			return "", fmt.Errorf("error globbing %q: %v", p, err)
+		}
+		switch len(matches) {
+		case 0:
+			// Try the next pattern.
+		case 1:
+			// We found it.
+			return matches[0], nil
+		default:
+			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
+		}
+	}
+	return "", fmt.Errorf("file %q not found", path)
+}
diff --git a/test/image/image_test.go b/test/image/image_test.go
index d0dcb1861..0a1e19d6f 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -107,7 +107,7 @@ func TestHttpd(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("http-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -139,7 +139,7 @@ func TestNginx(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("net-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -183,7 +183,7 @@ func TestMysql(t *testing.T) {
 	}
 
 	client := dockerutil.MakeDocker("mysql-client-test")
-	dir, err := dockerutil.PrepareFiles("mysql.sql")
+	dir, err := dockerutil.PrepareFiles("test/image/mysql.sql")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -283,7 +283,7 @@ func TestRuby(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("ruby-test")
 
-	dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh")
+	dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index 1df761dd0..cbab85ef7 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -2,8 +2,6 @@
 
 load("//tools:defs.bzl", "loopback")
 
-# syscall_test is a macro that will create targets to run the given test target
-# on the host (native) and runsc.
 def syscall_test(
         test,
         shard_count = 5,
@@ -13,6 +11,19 @@ def syscall_test(
         add_uds_tree = False,
         add_hostinet = False,
         tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+
     _syscall_test(
         test = test,
         shard_count = shard_count,
@@ -111,6 +122,19 @@ def _syscall_test(
     # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
     tags += [full_platform, "file_" + file_access]
 
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
     # Add tag to prevent the tests from running in a Bazel sandbox.
     # TODO(b/120560048): Make the tests run without this tag.
     tags.append("no-sandbox")
@@ -118,8 +142,11 @@ def _syscall_test(
     # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
     # more stable.
     if platform == "kvm":
-        tags += ["manual"]
-        tags += ["requires-kvm"]
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
 
     args = [
         # Arguments are passed directly to syscall_test_runner binary.
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 0a2d44a2c..85ec013d5 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -167,7 +167,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 }
 
 // Test that link resolution in a chroot can escape the root by following an
-// open proc fd.
+// open proc fd. Regression test for b/32316719.
 TEST(ChrootTest, ProcFdLinkResolutionInChroot) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
 
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index f41f99900..7cd6a75bd 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -46,7 +46,8 @@ TEST(ConcurrencyTest, SingleProcessMultithreaded) {
 }
 
 // Test that multiple threads in this process continue to execute in parallel,
-// even if an unrelated second process is spawned.
+// even if an unrelated second process is spawned. Regression test for
+// b/32119508.
 TEST(ConcurrencyTest, MultiProcessMultithreaded) {
   // In PID 1, start TIDs 1 and 2, and put both to sleep.
   //
diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc
index b790fe5be..2989379b7 100644
--- a/test/syscalls/linux/exec_proc_exe_workload.cc
+++ b/test/syscalls/linux/exec_proc_exe_workload.cc
@@ -21,6 +21,12 @@
 #include "test/util/posix_error.h"
 
 int main(int argc, char** argv, char** envp) {
+  // This is annoying. Because remote build systems may put these binaries
+  // in a content-addressable-store, you may wind up with /proc/self/exe
+  // pointing to some random path (but with a sensible argv[0]).
+  //
+  // Therefore, this test simply checks that the /proc/self/exe
+  // is absolute and *doesn't* match argv[1].
   std::string exe =
       gvisor::testing::ProcessExePath(getpid()).ValueOrDie();
   if (exe[0] != '/') {
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 906f3358d..ff8bdfeb0 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -271,7 +271,7 @@ TEST_F(ForkTest, Alarm) {
   EXPECT_EQ(0, alarmed);
 }
 
-// Child cannot affect parent private memory.
+// Child cannot affect parent private memory. Regression test for b/24137240.
 TEST_F(ForkTest, PrivateMemory) {
   std::atomic<uint32_t> local(0);
 
@@ -298,6 +298,9 @@ TEST_F(ForkTest, PrivateMemory) {
 }
 
 // Kernel-accessed buffers should remain coherent across COW.
+//
+// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates
+// differently. Regression test for b/33811887.
 TEST_F(ForkTest, COWSegment) {
   constexpr int kBufSize = 1024;
   char* read_buf = private_;
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 1c4d9f1c7..11fb1b457 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -1418,7 +1418,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
 //
 // On most platforms this is trivial, but when the file is mapped via the sentry
 // page cache (which does not yet support writing to shared mappings), a bug
-// caused reads to fail unnecessarily on such mappings.
+// caused reads to fail unnecessarily on such mappings. See b/28913513.
 TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
   uintptr_t addr;
   size_t len = strlen(kFileContents);
@@ -1435,7 +1435,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
 
 // Tests that EFAULT is returned when invoking a syscall that requires the OS to
 // read past end of file (resulting in a fault in sentry context in the gVisor
-// case).
+// case). See b/28913513.
 TEST_F(MMapFileTest, InternalSigBus) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
@@ -1578,7 +1578,7 @@ TEST_F(MMapFileTest, Bug38498194) {
 }
 
 // Tests that reading from a file to a memory mapping of the same file does not
-// deadlock.
+// deadlock. See b/34813270.
 TEST_F(MMapFileTest, SelfRead) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
@@ -1590,7 +1590,7 @@ TEST_F(MMapFileTest, SelfRead) {
 }
 
 // Tests that writing to a file from a memory mapping of the same file does not
-// deadlock.
+// deadlock. Regression test for b/34813270.
 TEST_F(MMapFileTest, SelfWrite) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 431733dbe..902d0a0dc 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -132,6 +132,7 @@ TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) {
 }
 
 // A file originally created RW, but opened RO can later be opened RW.
+// Regression test for b/65385065.
 TEST(CreateTest, OpenCreateROThenRW) {
   TempPath file(NewTempAbsPath());
 
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
index f7ea44054..5b0743fe9 100644
--- a/test/syscalls/linux/preadv.cc
+++ b/test/syscalls/linux/preadv.cc
@@ -37,6 +37,7 @@ namespace testing {
 
 namespace {
 
+// Stress copy-on-write. Attempts to reproduce b/38430174.
 TEST(PreadvTest, MMConcurrencyStress) {
   // Fill a one-page file with zeroes (the contents don't really matter).
   const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 169b723eb..a23fdb58d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1352,13 +1352,19 @@ TEST(ProcPidSymlink, SubprocessZombied) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 & gVisor: Syscall succeeds and returns 1
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 &  gVisor: Syscall succeeds and returns 1.
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 }
@@ -1431,8 +1437,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 TEST(ProcPidFile, SubprocessZombie) {
   char buf[1];
 
-  // 4.17: Succeeds and returns 1
-  // gVisor: Succeeds and returns 0
+  // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent
+  // behavior on different kernels.
+  //
+  // ~4.3: Succeeds and returns 0.
+  // 4.17: Succeeds and returns 1.
+  // gVisor: Succeeds and returns 0.
   EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds());
 
   EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)),
@@ -1458,7 +1468,10 @@ TEST(ProcPidFile, SubprocessZombie) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
+  //
+  // ~4.3: Fails and returns EACCES.
   // gVisor & 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)),
   //          SyscallFailsWithErrno(EACCES));
 }
@@ -1467,9 +1480,12 @@ TEST(ProcPidFile, SubprocessZombie) {
 TEST(ProcPidFile, SubprocessExited) {
   char buf[1];
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels.
+  //
+  // ~4.3: Fails and returns ESRCH.
   // gVisor: Fails with ESRCH.
   // 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(ESRCH));
 
@@ -1641,7 +1657,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task",
                                      TaskFiles(initial, {child1.Tid()})));
 
-  // Stat child1's task file.
+  // Stat child1's task file. Regression test for b/32097707.
   struct stat statbuf;
   const std::string child1_task_file =
       absl::StrCat("/proc/self/task/", child1.Tid());
@@ -1669,7 +1685,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(EventuallyDirContainsExactly(
       "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()})));
 
-  // Stat child1's task file again.  This time it should fail.
+  // Stat child1's task file again.  This time it should fail. See b/32097707.
   EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf),
               SyscallFailsWithErrno(ENOENT));
 
@@ -1824,7 +1840,7 @@ TEST(ProcSysVmOvercommitMemory, HasNumericValue) {
 }
 
 // Check that link for proc fd entries point the target node, not the
-// symlink itself.
+// symlink itself. Regression test for b/31155070.
 TEST(ProcTaskFd, FstatatFollowsSymlink) {
   const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor fd =
@@ -1883,6 +1899,20 @@ TEST(ProcMounts, IsSymlink) {
   EXPECT_EQ(link, "self/mounts");
 }
 
+TEST(ProcSelfMountinfo, RequiredFieldsArePresent) {
+  auto mountinfo =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo"));
+  EXPECT_THAT(
+      mountinfo,
+      AllOf(
+          // Root mount.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"),
+          // Proc mount - always rw.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)")));
+}
+
 // Check that /proc/self/mounts looks something like a real mounts file.
 TEST(ProcSelfMounts, RequiredFieldsArePresent) {
   auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts"));
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
index 4069cbc7e..baaf9f757 100644
--- a/test/syscalls/linux/readv.cc
+++ b/test/syscalls/linux/readv.cc
@@ -254,7 +254,9 @@ TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) {
 // This test depends on the maximum extent of a single readv() syscall, so
 // we can't tolerate interruption from saving.
 TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) {
-  // Ensure that we won't be interrupted by ITIMER_PROF.
+  // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly
+  // important in environments where automated profiling tools may start
+  // ITIMER_PROF automatically.
   struct itimerval itv = {};
   auto const cleanup_itimer =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv));
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 106c045e3..4bfb1ff56 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -36,7 +36,7 @@ namespace {
 // We must be very careful about how these tests are written. Each thread may
 // only have one struct rseq registration, which may be done automatically at
 // thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
-// not do so).
+// not do so, but other libraries do).
 //
 // Testing of rseq is thus done primarily in a child process with no
 // registration. This means exec'ing a nostdlib binary, as rseq registration can
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index 424e2a67f..be2364fb8 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -146,7 +146,7 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) {
 
 // This test illustrates Linux's behavior of 'select' calls passing after
 // setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on
-// this behavior.
+// this behavior. See b/122318458.
 TEST_F(SelectTest, SetrlimitCallNOFILE) {
   fd_set read_set;
   FD_ZERO(&read_set);
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index 7ba752599..c7fdbb924 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -473,7 +473,7 @@ TEST(ShmTest, PartialUnmap) {
 }
 
 // Check that sentry does not panic when asked for a zero-length private shm
-// segment.
+// segment. Regression test for b/110694797.
 TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) {
   EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _));
 }
diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc
index 654c6a47f..a603fc1d1 100644
--- a/test/syscalls/linux/sigprocmask.cc
+++ b/test/syscalls/linux/sigprocmask.cc
@@ -237,7 +237,7 @@ TEST_F(SigProcMaskTest, SignalHandler) {
 }
 
 // Check that sigprocmask correctly handles aliasing of the set and oldset
-// pointers.
+// pointers. Regression test for b/30502311.
 TEST_F(SigProcMaskTest, AliasedSets) {
   sigset_t mask;
 
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index 276a94eb8..884319e1d 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -109,7 +109,7 @@ PosixErrorOr<std::vector<Mapping>> CreateFragmentedRegion(const int size,
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be sent
-// successfully.
+// successfully. See b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -165,7 +165,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be received
-// into successfully.
+// into successfully. Regression test for b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index b249ff91f..03ee1250d 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -38,7 +38,7 @@ mode_t FilePermission(const std::string& path) {
 }
 
 // Test that name collisions are checked on the new link path, not the source
-// path.
+// path. Regression test for b/31782115.
 TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) {
   const std::string srcname = NewTempAbsPath();
   const std::string newname = NewTempAbsPath();
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 8a8b68e75..c4591a3b9 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -244,7 +244,8 @@ TEST_P(TcpSocketTest, ZeroWriteAllowed) {
 }
 
 // Test that a non-blocking write with a buffer that is larger than the send
-// buffer size will not actually write the whole thing at once.
+// buffer size will not actually write the whole thing at once. Regression test
+// for b/64438887.
 TEST_P(TcpSocketTest, NonblockingLargeWrite) {
   // Set the FD to O_NONBLOCK.
   int opts;
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index c7eead17e..1ccb95733 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -62,6 +62,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
               ::testing::KilledBySignal(SIGSEGV), "");
 }
 
+// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2.
 int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
   constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
   return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc
index bae377c69..8d8ebbb24 100644
--- a/test/syscalls/linux/tkill.cc
+++ b/test/syscalls/linux/tkill.cc
@@ -54,7 +54,7 @@ void SigHandler(int sig, siginfo_t* info, void* context) {
   TEST_CHECK(info->si_code == SI_TKILL);
 }
 
-// Test with a real signal.
+// Test with a real signal. Regression test for b/24790092.
 TEST(TkillTest, ValidTIDAndRealSignal) {
   struct sigaction sa;
   sa.sa_sigaction = SigHandler;
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index 35aacb172..9c10b6674 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -77,6 +77,7 @@ std::string NewTempAbsPath() {
 std::string NewTempRelPath() { return NextTempBasename(); }
 
 std::string GetAbsoluteTestTmpdir() {
+  // Note that TEST_TMPDIR is guaranteed to be set.
   char* env_tmpdir = getenv("TEST_TMPDIR");
   std::string tmp_dir =
       env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp";
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
index e99c87f81..a6db44e47 100644
--- a/tools/build/tags.bzl
+++ b/tools/build/tags.bzl
@@ -33,4 +33,8 @@ go_suffixes = [
     "_wasm_unsafe",
     "_linux",
     "_linux_unsafe",
+    "_opts",
+    "_opts_unsafe",
+    "_impl",
+    "_impl_unsafe",
 ]
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 5d5fa134a..c03b557ae 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -73,6 +73,16 @@ def calculate_sets(srcs):
             result[target].append(file)
     return result
 
+def go_imports(name, src, out):
+    """Simplify a single Go source file by eliminating unused imports (runs goimports on src, writing the result to out)."""
+    native.genrule(
+        name = name,
+        srcs = [src],
+        outs = [out],
+        tools = ["@org_golang_x_tools//cmd/goimports:goimports"],
+        cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
+    )
+
 def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
@@ -107,10 +117,15 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         state_sets = calculate_sets(srcs)
         for (suffix, srcs) in state_sets.items():
             go_stateify(
-                name = name + suffix + "_state_autogen",
+                name = name + suffix + "_state_autogen_with_imports",
                 srcs = srcs,
                 imports = imports,
                 package = name,
+                out = name + suffix + "_state_autogen_with_imports.go",
+            )
+            go_imports(
+                name = name + suffix + "_state_autogen",
+                src = name + suffix + "_state_autogen_with_imports.go",
                 out = name + suffix + "_state_autogen.go",
             )
         all_srcs = all_srcs + [
-- 
cgit v1.2.3


From c141eb5f430dc50f6bf90232c369b7b3a542155e Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 7 Feb 2020 13:47:57 -0800
Subject: Address GH comments.

---
 pkg/abi/linux/netfilter.go                | 2 +-
 pkg/sentry/socket/netfilter/extensions.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index e4aabb6bb..7363185b7 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -363,7 +363,7 @@ type XTTCP struct {
 	// range to which the matcher applies.
 	DestinationPortStart uint16
 
-	// DestinationPortEnd specifies the start of the destination port
+	// DestinationPortEnd specifies the end of the destination port
 	// range to which the matcher applies.
 	DestinationPortEnd uint16
 
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index b5fbb52e4..3082976cd 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
-- 
cgit v1.2.3


From 762e4761cc4edd92108f6836ad1933c7158b8be8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 11 Feb 2020 11:08:28 -0800
Subject: Move Align{Up,Down} into binary package.

PiperOrigin-RevId: 294477647
---
 pkg/binary/binary.go                      | 10 ++++++++++
 pkg/sentry/socket/control/control.go      | 26 ++++++++------------------
 pkg/sentry/socket/netfilter/extensions.go |  7 +------
 pkg/sentry/socket/netlink/message.go      | 15 ++++-----------
 pkg/sentry/strace/BUILD                   |  1 -
 pkg/sentry/strace/socket.go               |  7 +++----
 6 files changed, 26 insertions(+), 40 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/binary/binary.go b/pkg/binary/binary.go
index 631785f7b..25065aef9 100644
--- a/pkg/binary/binary.go
+++ b/pkg/binary/binary.go
@@ -254,3 +254,13 @@ func WriteUint64(w io.Writer, order binary.ByteOrder, num uint64) error {
 	_, err := w.Write(buf)
 	return err
 }
+
+// AlignUp rounds a length up to an alignment. align must be a power of 2.
+func AlignUp(length int, align uint) int {
+	return (length + int(align) - 1) & ^(int(align) - 1)
+}
+
+// AlignDown rounds a length down to an alignment. align must be a power of 2.
+func AlignDown(length int, align uint) int {
+	return length & ^(int(align) - 1)
+}
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 00265f15b..6145a7fc3 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -189,7 +189,7 @@ func putUint32(buf []byte, n uint32) []byte {
 // putCmsg writes a control message header and as much data as will fit into
 // the unused capacity of a buffer.
 func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) {
-	space := AlignDown(cap(buf)-len(buf), 4)
+	space := binary.AlignDown(cap(buf)-len(buf), 4)
 
 	// We can't write to space that doesn't exist, so if we are going to align
 	// the available space, we must align down.
@@ -282,19 +282,9 @@ func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int
 	return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c)
 }
 
-// AlignUp rounds a length up to an alignment. align must be a power of 2.
-func AlignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
-
-// AlignDown rounds a down to an alignment. align must be a power of 2.
-func AlignDown(length int, align uint) int {
-	return length & ^(int(align) - 1)
-}
-
 // alignSlice extends a slice's length (up to the capacity) to align it.
 func alignSlice(buf []byte, align uint) []byte {
-	aligned := AlignUp(len(buf), align)
+	aligned := binary.AlignUp(len(buf), align)
 	if aligned > cap(buf) {
 		// Linux allows unaligned data if there isn't room for alignment.
 		// Since there isn't room for alignment, there isn't room for any
@@ -377,7 +367,7 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 
 // cmsgSpace is equivalent to CMSG_SPACE in Linux.
 func cmsgSpace(t *kernel.Task, dataLen int) int {
-	return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width())
+	return linux.SizeOfControlMessageHeader + binary.AlignUp(dataLen, t.Arch().Width())
 }
 
 // CmsgsSpace returns the number of bytes needed to fit the control messages
@@ -437,7 +427,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_SOCKET:
 			switch h.Type {
 			case linux.SCM_RIGHTS:
-				rightsSize := AlignDown(length, linux.SizeOfControlMessageRight)
+				rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
 				numRights := rightsSize / linux.SizeOfControlMessageRight
 
 				if len(fds)+numRights > linux.SCM_MAX_FD {
@@ -448,7 +438,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 					fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
 				}
 
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			case linux.SCM_CREDENTIALS:
 				if length < linux.SizeOfControlMessageCredentials {
@@ -462,7 +452,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 					return socket.ControlMessages{}, err
 				}
 				cmsgs.Unix.Credentials = scmCreds
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				// Unknown message type.
@@ -476,7 +466,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				}
 				cmsgs.IP.HasTOS = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
@@ -489,7 +479,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				}
 				cmsgs.IP.HasTClass = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
-				i += AlignUp(length, width)
+				i += binary.AlignUp(length, width)
 
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 22fd0ebe7..b4b244abf 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -72,7 +72,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 	nflog("marshaling matcher %q", name)
 
 	// We have to pad this struct size to a multiple of 8 bytes.
-	size := alignUp(linux.SizeOfXTEntryMatch+len(data), 8)
+	size := binary.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8)
 	matcher := linux.KernelXTEntryMatch{
 		XTEntryMatch: linux.XTEntryMatch{
 			MatchSize: uint16(size),
@@ -93,8 +93,3 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter,
 	}
 	return matchMaker.unmarshal(buf, filter)
 }
-
-// alignUp rounds a length up to an alignment. align must be a power of 2.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) & ^(int(align) - 1)
-}
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index 4ea252ccb..0899c61d1 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -23,18 +23,11 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// alignUp rounds a length up to an alignment.
-//
-// Preconditions: align is a power of two.
-func alignUp(length int, align uint) int {
-	return (length + int(align) - 1) &^ (int(align) - 1)
-}
-
 // alignPad returns the length of padding required for alignment.
 //
 // Preconditions: align is a power of two.
 func alignPad(length int, align uint) int {
-	return alignUp(length, align) - length
+	return binary.AlignUp(length, align) - length
 }
 
 // Message contains a complete serialized netlink message.
@@ -138,7 +131,7 @@ func (m *Message) Finalize() []byte {
 	// Align the message. Note that the message length in the header (set
 	// above) is the useful length of the message, not the total aligned
 	// length. See net/netlink/af_netlink.c:__nlmsg_put.
-	aligned := alignUp(len(m.buf), linux.NLMSG_ALIGNTO)
+	aligned := binary.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO)
 	m.putZeros(aligned - len(m.buf))
 	return m.buf
 }
@@ -173,7 +166,7 @@ func (m *Message) PutAttr(atype uint16, v interface{}) {
 	m.Put(v)
 
 	// Align the attribute.
-	aligned := alignUp(l, linux.NLA_ALIGNTO)
+	aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
 	m.putZeros(aligned - l)
 }
 
@@ -190,7 +183,7 @@ func (m *Message) PutAttrString(atype uint16, s string) {
 	m.putZeros(1)
 
 	// Align the attribute.
-	aligned := alignUp(l, linux.NLA_ALIGNTO)
+	aligned := binary.AlignUp(l, linux.NLA_ALIGNTO)
 	m.putZeros(aligned - l)
 }
 
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 762a946fe..2f39a6f2b 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -30,7 +30,6 @@ go_library(
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/kernel",
-        "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/syscalls/linux",
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index f7ff4573e..51e6d81b2 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
@@ -220,13 +219,13 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 
 		if skipData {
 			strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length))
-			i += control.AlignUp(length, width)
+			i += binary.AlignUp(length, width)
 			continue
 		}
 
 		switch h.Type {
 		case linux.SCM_RIGHTS:
-			rightsSize := control.AlignDown(length, linux.SizeOfControlMessageRight)
+			rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight)
 
 			numRights := rightsSize / linux.SizeOfControlMessageRight
 			fds := make(linux.ControlMessageRights, numRights)
@@ -295,7 +294,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 		default:
 			panic("unreachable")
 		}
-		i += control.AlignUp(length, width)
+		i += binary.AlignUp(length, width)
 	}
 
 	return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", "))
-- 
cgit v1.2.3


From 6fdf2c53a1d084b70602170b660242036fd8fe4f Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 7 Feb 2020 11:21:07 -0800
Subject: iptables: User chains

- Adds creation of user chains via `-N <chainname>`
- Adds `-j RETURN` support for built-in chains, which triggers the
  chain's underflow rule (usually the default policy).
- Adds tests for chain creation, default policies, and `-j RETURN` from
  built-in chains.
---
 pkg/sentry/socket/netfilter/netfilter.go | 115 +++++++++++++++++++-----------
 pkg/tcpip/iptables/iptables.go           |  74 ++++++++++++-------
 pkg/tcpip/iptables/targets.go            |  41 ++++++++---
 pkg/tcpip/iptables/types.go              |  50 +++++--------
 test/iptables/filter_input.go            | 117 ++++++++++++++++++++++++++++++-
 test/iptables/iptables_test.go           |  24 +++++++
 6 files changed, 310 insertions(+), 111 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index ea02627de..3fc80e0de 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -50,7 +50,9 @@ type metadata struct {
 
 // nflog logs messages related to the writing and reading of iptables.
 func nflog(format string, args ...interface{}) {
-	log.Infof("netfilter: "+format, args...)
+	if log.IsLogging(log.Debug) {
+		log.Debugf("netfilter: "+format, args...)
+	}
 }
 
 // GetInfo returns information about iptables.
@@ -227,19 +229,23 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 }
 
 func marshalTarget(target iptables.Target) []byte {
-	switch target.(type) {
-	case iptables.UnconditionalAcceptTarget:
-		return marshalStandardTarget(iptables.Accept)
-	case iptables.UnconditionalDropTarget:
-		return marshalStandardTarget(iptables.Drop)
+	switch tg := target.(type) {
+	case iptables.AcceptTarget:
+		return marshalStandardTarget(iptables.RuleAccept)
+	case iptables.DropTarget:
+		return marshalStandardTarget(iptables.RuleDrop)
 	case iptables.ErrorTarget:
-		return marshalErrorTarget()
+		return marshalErrorTarget(errorTargetName)
+	case iptables.UserChainTarget:
+		return marshalErrorTarget(tg.Name)
+	case iptables.ReturnTarget:
+		return marshalStandardTarget(iptables.RuleReturn)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
-func marshalStandardTarget(verdict iptables.Verdict) []byte {
+func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
 	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
 
 	// The target's name will be the empty string.
@@ -254,14 +260,14 @@ func marshalStandardTarget(verdict iptables.Verdict) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
-func marshalErrorTarget() []byte {
+func marshalErrorTarget(errorName string) []byte {
 	// This is an error target named error
 	target := linux.XTErrorTarget{
 		Target: linux.XTEntryTarget{
 			TargetSize: linux.SizeOfXTErrorTarget,
 		},
 	}
-	copy(target.Name[:], errorTargetName)
+	copy(target.Name[:], errorName)
 	copy(target.Target.Name[:], errorTargetName)
 
 	ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
@@ -270,38 +276,35 @@ func marshalErrorTarget() []byte {
 
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
-func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
+func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
 	switch verdict {
-	case iptables.Accept:
+	case iptables.RuleAccept:
 		return -linux.NF_ACCEPT - 1
-	case iptables.Drop:
+	case iptables.RuleDrop:
 		return -linux.NF_DROP - 1
-	case iptables.Queue:
-		return -linux.NF_QUEUE - 1
-	case iptables.Return:
+	case iptables.RuleReturn:
 		return linux.NF_RETURN
-	case iptables.Jump:
+	default:
 		// TODO(gvisor.dev/issue/170): Support Jump.
-		panic("Jump isn't supported yet")
+		panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
 	}
-	panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
 }
 
-// translateToStandardVerdict translates from the value in a
+// translateToStandardTarget translates from the value in a
 // linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardVerdict(val int32) (iptables.Verdict, error) {
+func translateToStandardTarget(val int32) (iptables.Target, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
-		return iptables.Accept, nil
+		return iptables.AcceptTarget{}, nil
 	case -linux.NF_DROP - 1:
-		return iptables.Drop, nil
+		return iptables.DropTarget{}, nil
 	case -linux.NF_QUEUE - 1:
-		return iptables.Invalid, errors.New("unsupported iptables verdict QUEUE")
+		return nil, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		return iptables.Invalid, errors.New("unsupported iptables verdict RETURN")
+		return iptables.ReturnTarget{}, nil
 	default:
-		return iptables.Invalid, fmt.Errorf("unknown iptables verdict %d", val)
+		return nil, fmt.Errorf("unknown iptables verdict %d", val)
 	}
 }
 
@@ -411,6 +414,10 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
+					if !validUnderflow(table.Rules[ruleIdx]) {
+						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.")
+						return syserr.ErrInvalidArgument
+					}
 					table.Underflows[hk] = ruleIdx
 				}
 			}
@@ -425,12 +432,34 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		}
 	}
 
+	// Add the user chains.
+	for ruleIdx, rule := range table.Rules {
+		target, ok := rule.Target.(iptables.UserChainTarget)
+		if !ok {
+			continue
+		}
+
+		// We found a user chain. Before inserting it into the table,
+		// check that:
+		// - There's some other rule after it.
+		// - There are no matchers.
+		if ruleIdx == len(table.Rules)-1 {
+			nflog("user chain must have a rule or default policy.")
+			return syserr.ErrInvalidArgument
+		}
+		if len(table.Rules[ruleIdx].Matchers) != 0 {
+			nflog("user chain's first node must have no matcheres.")
+			return syserr.ErrInvalidArgument
+		}
+		table.UserChains[target.Name] = ruleIdx + 1
+	}
+
 	// TODO(gvisor.dev/issue/170): Support other chains.
 	// Since we only support modifying the INPUT chain right now, make sure
 	// all other chains point to ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
 		if hook != iptables.Input {
-			if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok {
+			if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -519,18 +548,7 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		buf = optVal[:linux.SizeOfXTStandardTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		verdict, err := translateToStandardVerdict(standardTarget.Verdict)
-		if err != nil {
-			return nil, err
-		}
-		switch verdict {
-		case iptables.Accept:
-			return iptables.UnconditionalAcceptTarget{}, nil
-		case iptables.Drop:
-			return iptables.UnconditionalDropTarget{}, nil
-		default:
-			return nil, fmt.Errorf("Unknown verdict: %v", verdict)
-		}
+		return translateToStandardTarget(standardTarget.Verdict)
 
 	case errorTargetName:
 		// Error target.
@@ -548,11 +566,14 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		//   somehow fall through every rule.
 		// * To mark the start of a user defined chain. These
 		//   rules have an error with the name of the chain.
-		switch errorTarget.Name.String() {
+		switch name := errorTarget.Name.String(); name {
 		case errorTargetName:
+			nflog("set entries: error target")
 			return iptables.ErrorTarget{}, nil
 		default:
-			return nil, fmt.Errorf("unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
+			// User defined chain.
+			nflog("set entries: user-defined target %q", name)
+			return iptables.UserChainTarget{Name: name}, nil
 		}
 	}
 
@@ -585,6 +606,18 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 		iptip.InverseFlags != 0
 }
 
+func validUnderflow(rule iptables.Rule) bool {
+	if len(rule.Matchers) != 0 {
+		return false
+	}
+	switch rule.Target.(type) {
+	case iptables.AcceptTarget, iptables.DropTarget:
+		return true
+	default:
+		return false
+	}
+}
+
 func hookFromLinux(hook int) iptables.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 1b9485bbd..75a433a3b 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -52,10 +52,10 @@ func DefaultTables() IPTables {
 		Tables: map[string]Table{
 			TablenameNat: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -74,8 +74,8 @@ func DefaultTables() IPTables {
 			},
 			TablenameMangle: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -90,9 +90,9 @@ func DefaultTables() IPTables {
 			},
 			TablenameFilter: Table{
 				Rules: []Rule{
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
-					Rule{Target: UnconditionalAcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
 					Rule{Target: ErrorTarget{}},
 				},
 				BuiltinChains: map[Hook]int{
@@ -149,13 +149,11 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 	for _, tablename := range it.Priorities[hook] {
 		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
 		// If the table returns Accept, move on to the next table.
-		case Accept:
+		case TableAccept:
 			continue
 		// The Drop verdict is final.
-		case Drop:
+		case TableDrop:
 			return false
-		case Stolen, Queue, Repeat, None, Jump, Return, Continue:
-			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
 		default:
 			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
@@ -166,36 +164,58 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 }
 
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) Verdict {
+func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) TableVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
 	table := it.Tables[tablename]
 	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
 		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
-		// In either of these cases, this table is done with the packet.
-		case Accept, Drop:
-			return verdict
-		// Continue traversing the rules of the table.
-		case Continue:
+		case RuleAccept:
+			return TableAccept
+
+		case RuleDrop:
+			return TableDrop
+
+		case RuleContinue:
 			continue
-		case Stolen, Queue, Repeat, None, Jump, Return:
-			panic(fmt.Sprintf("Unimplemented verdict %v.", verdict))
+
+		case RuleReturn:
+			// TODO(gvisor.dev/issue/170): We don't implement jump
+			// yet, so any Return is from a built-in chain. That
+			// means we have to call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			// Underflow is guaranteed to be an unconditional
+			// ACCEPT or DROP.
+			switch v, _ := underflow.Target.Action(pkt); v {
+			case RuleAccept:
+				return TableAccept
+			case RuleDrop:
+				return TableDrop
+			case RuleContinue, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
 		default:
-			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
+			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
 		}
+
 	}
 
-	panic(fmt.Sprintf("Traversed past the entire list of iptables rules in table %q.", tablename))
+	// We got through the entire table without a decision. Default to DROP
+	// for safety.
+	return TableDrop
 }
 
 // Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) Verdict {
+func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
 	rule := table.Rules[ruleIdx]
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
 	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
-		return Continue
+		return RuleContinue
 	}
 
 	// Go through each rule matcher. If they all match, run
@@ -203,10 +223,10 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	for _, matcher := range rule.Matchers {
 		matches, hotdrop := matcher.Match(hook, pkt, "")
 		if hotdrop {
-			return Drop
+			return RuleDrop
 		}
 		if !matches {
-			return Continue
+			return RuleContinue
 		}
 	}
 
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 4dd281371..9fc60cfad 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -21,20 +21,20 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
-// UnconditionalAcceptTarget accepts all packets.
-type UnconditionalAcceptTarget struct{}
+// AcceptTarget accepts packets.
+type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalAcceptTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	return Accept, ""
+func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleAccept, ""
 }
 
-// UnconditionalDropTarget denies all packets.
-type UnconditionalDropTarget struct{}
+// DropTarget drops packets.
+type DropTarget struct{}
 
 // Action implements Target.Action.
-func (UnconditionalDropTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	return Drop, ""
+func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleDrop, ""
 }
 
 // ErrorTarget logs an error and drops the packet. It represents a target that
@@ -42,7 +42,26 @@ func (UnconditionalDropTarget) Action(packet tcpip.PacketBuffer) (Verdict, strin
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (Verdict, string) {
-	log.Warningf("ErrorTarget triggered.")
-	return Drop, ""
+func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+	log.Debugf("ErrorTarget triggered.")
+	return RuleDrop, ""
+}
+
+// UserChainTarget marks a rule as the beginning of a user chain.
+type UserChainTarget struct {
+	Name string
+}
+
+// Action implements Target.Action.
+func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+	panic("UserChainTarget should never be called.")
+}
+
+// ReturnTarget returns from the current chain. If the chain is a built-in, the
+// hook's underflow should be called.
+type ReturnTarget struct{}
+
+// Action implements Target.Action.
+func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+	return RuleReturn, ""
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 7d593c35c..5735d001b 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -56,44 +56,32 @@ const (
 	NumHooks
 )
 
-// A Verdict is returned by a rule's target to indicate how traversal of rules
-// should (or should not) continue.
-type Verdict int
+// A TableVerdict is what a table decides should be done with a packet.
+type TableVerdict int
 
 const (
-	// Invalid indicates an unkonwn or erroneous verdict.
-	Invalid Verdict = iota
+	// TableAccept indicates the packet should continue through netstack.
+	TableAccept TableVerdict = iota
 
-	// Accept indicates the packet should continue traversing netstack as
-	// normal.
-	Accept
-
-	// Drop inicates the packet should be dropped, stopping traversing
-	// netstack.
-	Drop
-
-	// Stolen indicates the packet was co-opted by the target and should
-	// stop traversing netstack.
-	Stolen
-
-	// Queue indicates the packet should be queued for userspace processing.
-	Queue
+	// TableDrop indicates the packet should be dropped.
+	TableDrop
+)
 
-	// Repeat indicates the packet should re-traverse the chains for the
-	// current hook.
-	Repeat
+// A RuleVerdict is what a rule decides should be done with a packet.
+type RuleVerdict int
 
-	// None indicates no verdict was reached.
-	None
+const (
+	// RuleAccept indicates the packet should continue through netstack.
+	RuleAccept RuleVerdict = iota
 
-	// Jump indicates a jump to another chain.
-	Jump
+	// RuleContinue indicates the packet should continue to the next rule.
+	RuleContinue
 
-	// Continue indicates that traversal should continue at the next rule.
-	Continue
+	// RuleDrop indicates the packet should be dropped.
+	RuleDrop
 
-	// Return indicates that traversal should return to the calling chain.
-	Return
+	// RuleReturn indicates the packet should return to the previous chain.
+	RuleReturn
 )
 
 // IPTables holds all the tables for a netstack.
@@ -187,5 +175,5 @@ type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (Verdict, string)
+	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
 }
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index bd6059921..e26d6a7d2 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -36,6 +36,10 @@ func init() {
 	RegisterTestCase(FilterInputDropTCPSrcPort{})
 	RegisterTestCase(FilterInputDropUDPPort{})
 	RegisterTestCase(FilterInputDropUDP{})
+	RegisterTestCase(FilterInputCreateUserChain{})
+	RegisterTestCase(FilterInputDefaultPolicyAccept{})
+	RegisterTestCase(FilterInputDefaultPolicyDrop{})
+	RegisterTestCase(FilterInputReturnUnderflow{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -295,8 +299,119 @@ func (FilterInputRequireProtocolUDP) ContainerAction(ip net.IP) error {
 	return nil
 }
 
-// LocalAction implements TestCase.LocalAction.
 func (FilterInputRequireProtocolUDP) LocalAction(ip net.IP) error {
 	// No-op.
 	return nil
 }
+
+// FilterInputCreateUserChain tests chain creation.
+type FilterInputCreateUserChain struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputCreateUserChain) Name() string {
+	return "FilterInputCreateUserChain"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputCreateUserChain) ContainerAction(ip net.IP) error {
+	// Create a chain.
+	const chainName = "foochain"
+	if err := filterTable("-N", chainName); err != nil {
+		return err
+	}
+
+	// Add a simple rule to the chain.
+	return filterTable("-A", chainName, "-j", "DROP")
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputCreateUserChain) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// FilterInputDefaultPolicyAccept tests the default ACCEPT policy.
+type FilterInputDefaultPolicyAccept struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDefaultPolicyAccept) Name() string {
+	return "FilterInputDefaultPolicyAccept"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDefaultPolicyAccept) ContainerAction(ip net.IP) error {
+	// Set the default policy to accept, then receive a packet.
+	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDefaultPolicyAccept) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputDefaultPolicyDrop tests the default DROP policy.
+type FilterInputDefaultPolicyDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDefaultPolicyDrop) Name() string {
+	return "FilterInputDefaultPolicyDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDefaultPolicyDrop) ContainerAction(ip net.IP) error {
+	if err := filterTable("-P", "INPUT", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDefaultPolicyDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputReturnUnderflow tests that -j RETURN in a built-in chain causes
+// the underflow rule (i.e. default policy) to be executed.
+type FilterInputReturnUnderflow struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputReturnUnderflow) Name() string {
+	return "FilterInputReturnUnderflow"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
+	// Add a RETURN rule followed by an unconditional drop, and set the
+	// default policy to ACCEPT.
+	if err := filterTable("-A", "INPUT", "-j", "RETURN"); err != nil {
+		return err
+	}
+	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
+		return err
+	}
+	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// We should receive packets, as the RETURN rule will trigger the default
+	// ACCEPT policy.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 41909582a..46a7c99b0 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -214,6 +214,30 @@ func TestFilterInputDropTCPSrcPort(t *testing.T) {
 	}
 }
 
+func TestFilterInputCreateUserChain(t *testing.T) {
+	if err := singleTest(FilterInputCreateUserChain{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDefaultPolicyAccept(t *testing.T) {
+	if err := singleTest(FilterInputDefaultPolicyAccept{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDefaultPolicyDrop(t *testing.T) {
+	if err := singleTest(FilterInputDefaultPolicyDrop{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputReturnUnderflow(t *testing.T) {
+	if err := singleTest(FilterInputReturnUnderflow{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
 	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
 		t.Fatal(err)
-- 
cgit v1.2.3


From 69bf39e8a47d3b4dcbbd04d2e8df476cdfab5e74 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 10:58:47 -0800
Subject: Internal change.

PiperOrigin-RevId: 294952610
---
 pkg/abi/linux/socket.go                        | 13 ++++
 pkg/sentry/socket/control/BUILD                |  1 +
 pkg/sentry/socket/control/control.go           | 43 +++++++++++++
 pkg/sentry/socket/hostinet/socket.go           | 11 +++-
 pkg/sentry/socket/netstack/netstack.go         | 37 ++++++++++--
 pkg/tcpip/tcpip.go                             | 25 ++++++++
 pkg/tcpip/transport/udp/endpoint.go            | 26 ++++++++
 test/syscalls/linux/socket_ip_udp_generic.cc   | 44 ++++++++++++++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 84 ++++++++++++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc   |  1 -
 10 files changed, 278 insertions(+), 7 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 766ee4014..4a14ef691 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -411,6 +411,15 @@ type ControlMessageCredentials struct {
 	GID uint32
 }
 
+// A ControlMessageIPPacketInfo is an IP_PKTINFO socket control message.
+//
+// ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h.
+type ControlMessageIPPacketInfo struct {
+	NIC             int32
+	LocalAddr       InetAddr
+	DestinationAddr InetAddr
+}
+
 // SizeOfControlMessageCredentials is the binary size of a
 // ControlMessageCredentials struct.
 var SizeOfControlMessageCredentials = int(binary.Size(ControlMessageCredentials{}))
@@ -431,6 +440,10 @@ const SizeOfControlMessageTOS = 1
 // SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message.
 const SizeOfControlMessageTClass = 4
 
+// SizeOfControlMessageIPPacketInfo is the size of an IP_PKTINFO
+// control message.
+const SizeOfControlMessageIPPacketInfo = 12
+
 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call.
 // From net/scm.h.
 const SCM_MAX_FD = 253
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 79e16d6e8..4d42d29cb 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/syserror",
+        "//pkg/tcpip",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 6145a7fc3..4667373d2 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -338,6 +339,22 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
+// PackIPPacketInfo packs an IP_PKTINFO socket control message.
+func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte {
+	var p linux.ControlMessageIPPacketInfo
+	p.NIC = int32(packetInfo.NIC)
+	copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr))
+	copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr))
+
+	return putCmsgStruct(
+		buf,
+		linux.SOL_IP,
+		linux.IP_PKTINFO,
+		t.Arch().Width(),
+		p,
+	)
+}
+
 // PackControlMessages packs control messages into the given buffer.
 //
 // We skip control messages specific to Unix domain sockets.
@@ -362,6 +379,10 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 		buf = PackTClass(t, cmsgs.IP.TClass, buf)
 	}
 
+	if cmsgs.IP.HasIPPacketInfo {
+		buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf)
+	}
+
 	return buf
 }
 
@@ -394,6 +415,16 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
 	return space
 }
 
+// NewIPPacketInfo returns the IPPacketInfo struct.
+func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo {
+	var p tcpip.IPPacketInfo
+	p.NIC = tcpip.NICID(packetInfo.NIC)
+	copy([]byte(p.LocalAddr), packetInfo.LocalAddr[:])
+	copy([]byte(p.DestinationAddr), packetInfo.DestinationAddr[:])
+
+	return p
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
@@ -468,6 +499,18 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += binary.AlignUp(length, width)
 
+			case linux.IP_PKTINFO:
+				if length < linux.SizeOfControlMessageIPPacketInfo {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
+
+				cmsgs.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+
+				cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo)
+				i += binary.AlignUp(length, width)
+
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
 			}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index de76388ac..22f78d2e2 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -289,7 +289,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	switch level {
 	case linux.SOL_IP:
 		switch name {
-		case linux.IP_TOS, linux.IP_RECVTOS:
+		case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_IPV6:
@@ -336,6 +336,8 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 		switch name {
 		case linux.IP_TOS, linux.IP_RECVTOS:
 			optlen = sizeofInt32
+		case linux.IP_PKTINFO:
+			optlen = linux.SizeOfControlMessageIPPacketInfo
 		}
 	case linux.SOL_IPV6:
 		switch name {
@@ -473,7 +475,14 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			case syscall.IP_TOS:
 				controlMessages.IP.HasTOS = true
 				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS)
+
+			case syscall.IP_PKTINFO:
+				controlMessages.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+				controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo)
 			}
+
 		case syscall.SOL_IPV6:
 			switch unixCmsg.Header.Type {
 			case syscall.IPV6_TCLASS:
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index ed2fbcceb..9757fbfba 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1414,6 +1414,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return o, nil
 
+	case linux.IP_PKTINFO:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1762,6 +1777,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		linux.IPV6_IPSEC_POLICY,
 		linux.IPV6_JOIN_ANYCAST,
 		linux.IPV6_LEAVE_ANYCAST,
+		// TODO(b/148887420): Add support for IPV6_PKTINFO.
 		linux.IPV6_PKTINFO,
 		linux.IPV6_ROUTER_ALERT,
 		linux.IPV6_XFRM_POLICY,
@@ -1949,6 +1965,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
 
+	case linux.IP_PKTINFO:
+		if len(optVal) == 0 {
+			return nil
+		}
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1964,7 +1990,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_NODEFRAG,
 		linux.IP_OPTIONS,
 		linux.IP_PASSSEC,
-		linux.IP_PKTINFO,
 		linux.IP_RECVERR,
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
@@ -2395,10 +2420,12 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
 	return socket.ControlMessages{
 		IP: tcpip.ControlMessages{
-			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
-			Timestamp:    s.readCM.Timestamp,
-			HasTOS:       s.readCM.HasTOS,
-			TOS:          s.readCM.TOS,
+			HasTimestamp:    s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:       s.readCM.Timestamp,
+			HasTOS:          s.readCM.HasTOS,
+			TOS:             s.readCM.TOS,
+			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
+			PacketInfo:      s.readCM.PacketInfo,
 		},
 	}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0e944712f..9ca39ce40 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -328,6 +328,12 @@ type ControlMessages struct {
 
 	// Tclass is the IPv6 traffic class of the associated packet.
 	TClass int32
+
+	// HasIPPacketInfo indicates whether PacketInfo is set.
+	HasIPPacketInfo bool
+
+	// PacketInfo holds interface and address data on an incoming packet.
+	PacketInfo IPPacketInfo
 }
 
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
@@ -503,6 +509,11 @@ const (
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
 	V6OnlyOption
+
+	// ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify
+	// if more information is provided with incoming packets such
+	// as interface index and address.
+	ReceiveIPPacketInfoOption
 )
 
 // SockOptInt represents socket options which values have the int type.
@@ -685,6 +696,20 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
+// IPPacketInfo is the message structure for IP_PKTINFO.
+//
+// +stateify savable
+type IPPacketInfo struct {
+	// NIC is the ID of the NIC to be used.
+	NIC NICID
+
+	// LocalAddr is the local address.
+	LocalAddr Address
+
+	// DestinationAddr is the destination address.
+	DestinationAddr Address
+}
+
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index c9cbed8f4..3fe91cac2 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -29,6 +29,7 @@ import (
 type udpPacket struct {
 	udpPacketEntry
 	senderAddress tcpip.FullAddress
+	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
 	tos           uint8
@@ -118,6 +119,9 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveIPPacketInfo determines if the packet info is returned by Read.
+	receiveIPPacketInfo bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -254,11 +258,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
+
+	if receiveIPPacketInfo {
+		cm.HasIPPacketInfo = true
+		cm.PacketInfo = p.packetInfo
+	}
 	return p.data.ToView(), cm, nil
 }
 
@@ -495,6 +505,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
+		return nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+		return nil
 	}
 
 	return nil
@@ -703,6 +720,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 
 		return v, nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.RLock()
+		v := e.receiveIPPacketInfo
+		e.mu.RUnlock()
+		return v, nil
 	}
 
 	return false, tcpip.ErrUnknownProtocolOption
@@ -1247,6 +1270,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	switch r.NetProto {
 	case header.IPv4ProtocolNumber:
 		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+		packet.packetInfo.LocalAddr = r.LocalAddress
+		packet.packetInfo.DestinationAddr = r.RemoteAddress
+		packet.packetInfo.NIC = r.NICID()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 53290bed7..db5663ecd 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -357,5 +357,49 @@ TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Test getsockopt for a socket which is not set with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, IPPKTINFODefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_IP, IP_PKTINFO, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test setsockopt and getsockopt for a socket with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  // Toggle IP_PKTINFO on and then off, checking getsockopt after each change.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOn);
+  EXPECT_EQ(get_len, sizeof(get));
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOff);
+  EXPECT_EQ(get_len, sizeof(get));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 990ccf23c..bc4b07a62 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -15,6 +15,7 @@
 #include "test/syscalls/linux/socket_ipv4_udp_unbound.h"
 
 #include <arpa/inet.h>
+#include <net/if.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -2128,5 +2129,88 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
+// Test that socket will receive packet info control message.
+TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF((IsRunningWithHostinet()));
+
+  auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto sender_addr = V4Loopback();
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  ASSERT_THAT(
+      bind(receiver->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t sender_addr_len = sender_addr.addr_len;
+  ASSERT_THAT(getsockname(receiver->get(),
+                          reinterpret_cast<sockaddr*>(&sender_addr.addr),
+                          &sender_addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+  auto receiver_addr = V4Loopback();
+  reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port =
+      reinterpret_cast<sockaddr_in*>(&sender_addr.addr)->sin_port;
+  ASSERT_THAT(
+      connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+              receiver_addr.addr_len),
+      SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  ASSERT_THAT(
+      setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  msghdr sent_msg = {};
+  iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = sent_data;
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  sent_msg.msg_flags = 0;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  msghdr received_msg = {};
+  iovec received_iov = {};
+  char received_data[kDataLength];
+  char received_cmsg_buf[CMSG_SPACE(sizeof(in_pktinfo))] = {};
+  size_t cmsg_data_len = sizeof(in_pktinfo);
+  received_iov.iov_base = received_data;
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  received_msg.msg_control = received_cmsg_buf;
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(receiver->get(), &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, level);
+  EXPECT_EQ(cmsg->cmsg_type, type);
+
+  // Get loopback index.
+  ifreq ifr = {};
+  absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo");
+  ASSERT_THAT(ioctl(sender->get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  ASSERT_NE(ifr.ifr_ifindex, 0);
+
+  // Check the data
+  in_pktinfo received_pktinfo = {};
+  memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo));
+  EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex);
+  EXPECT_EQ(received_pktinfo.ipi_spec_dst.s_addr, htonl(INADDR_LOOPBACK));
+  EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index a2f6ef8cc..9f8de6b48 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1495,6 +1495,5 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 6ef63cd7da107d487fda7c48af50fa9802913cd9 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 12 Feb 2020 16:19:06 -0800
Subject: We can now create and jump in iptables. For example:

$ iptables -N foochain
$ iptables -A INPUT -j foochain
---
 pkg/abi/linux/netfilter.go               |   9 +-
 pkg/sentry/socket/netfilter/BUILD        |   1 +
 pkg/sentry/socket/netfilter/netfilter.go |  62 +++++++--
 pkg/sentry/socket/netfilter/targets.go   |  35 +++++
 pkg/tcpip/iptables/iptables.go           | 103 +++++++++------
 pkg/tcpip/iptables/targets.go            |  20 ++-
 pkg/tcpip/iptables/types.go              |  21 +--
 test/iptables/filter_input.go            | 217 ++++++++++++++++++++++++++++---
 test/iptables/iptables_test.go           |  36 +++++
 test/iptables/iptables_util.go           |  10 ++
 10 files changed, 420 insertions(+), 94 deletions(-)
 create mode 100644 pkg/sentry/socket/netfilter/targets.go

(limited to 'pkg/sentry/socket')

diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index bbc4df74c..bd2e13ba1 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -225,11 +225,14 @@ type XTEntryTarget struct {
 // SizeOfXTEntryTarget is the size of an XTEntryTarget.
 const SizeOfXTEntryTarget = 32
 
-// XTStandardTarget is a builtin target, one of ACCEPT, DROP, JUMP, QUEUE, or
-// RETURN. It corresponds to struct xt_standard_target in
+// XTStandardTarget is a built-in target, one of ACCEPT, DROP, QUEUE, RETURN,
+// or a jump. It corresponds to struct xt_standard_target in
 // include/uapi/linux/netfilter/x_tables.h.
 type XTStandardTarget struct {
-	Target  XTEntryTarget
+	Target XTEntryTarget
+	// A positive verdict indicates a jump, and is the offset from the
+	// start of the table to jump to. A negative value means one of the
+	// other built-in targets.
 	Verdict int32
 	_       [4]byte
 }
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index c91ec7494..7cd2ce55b 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "extensions.go",
         "netfilter.go",
+        "targets.go",
         "tcp_matcher.go",
         "udp_matcher.go",
     ],
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3fc80e0de..d322e4144 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -240,13 +240,15 @@ func marshalTarget(target iptables.Target) []byte {
 		return marshalErrorTarget(tg.Name)
 	case iptables.ReturnTarget:
 		return marshalStandardTarget(iptables.RuleReturn)
+	case JumpTarget:
+		return marshalJumpTarget(tg)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
 func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
-	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+	nflog("convert to binary: marshalling standard target")
 
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
@@ -274,6 +276,23 @@ func marshalErrorTarget(errorName string) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
+func marshalJumpTarget(jt JumpTarget) []byte {
+	nflog("convert to binary: marshalling jump target")
+
+	// The target's name will be the empty string.
+	target := linux.XTStandardTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTStandardTarget,
+		},
+		// Verdict is overloaded by the ABI. When positive, it holds
+		// the jump offset from the start of the table.
+		Verdict: int32(jt.Offset),
+	}
+
+	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
 func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
@@ -335,7 +354,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
-	var offsets []uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
 		nflog("set entries: processing entry at offset %d", offset)
 
@@ -396,11 +416,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			Target:   target,
 			Matchers: matchers,
 		})
-		offsets = append(offsets, offset)
+		offsets[offset] = int(entryIdx)
 		offset += uint32(entry.NextOffset)
 
 		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
 			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 	}
 
@@ -409,13 +430,13 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, _ := range replace.HookEntry {
 		if table.ValidHooks()&(1<<hook) != 0 {
 			hk := hookFromLinux(hook)
-			for ruleIdx, offset := range offsets {
+			for offset, ruleIdx := range offsets {
 				if offset == replace.HookEntry[hook] {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
 					if !validUnderflow(table.Rules[ruleIdx]) {
-						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.")
+						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP")
 						return syserr.ErrInvalidArgument
 					}
 					table.Underflows[hk] = ruleIdx
@@ -444,16 +465,35 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// - There's some other rule after it.
 		// - There are no matchers.
 		if ruleIdx == len(table.Rules)-1 {
-			nflog("user chain must have a rule or default policy.")
+			nflog("user chain must have a rule or default policy")
 			return syserr.ErrInvalidArgument
 		}
 		if len(table.Rules[ruleIdx].Matchers) != 0 {
-			nflog("user chain's first node must have no matcheres.")
+			nflog("user chain's first node must have no matchers")
 			return syserr.ErrInvalidArgument
 		}
 		table.UserChains[target.Name] = ruleIdx + 1
 	}
 
+	// Set each jump to point to the appropriate rule. Right now they hold byte
+	// offsets.
+	for ruleIdx, rule := range table.Rules {
+		jump, ok := rule.Target.(JumpTarget)
+		if !ok {
+			continue
+		}
+
+		// Find the rule corresponding to the jump rule offset.
+		jumpTo, ok := offsets[jump.Offset]
+		if !ok {
+			nflog("failed to find a rule to jump to")
+			return syserr.ErrInvalidArgument
+		}
+		jump.RuleNum = jumpTo
+		rule.Target = jump
+		table.Rules[ruleIdx] = rule
+	}
+
 	// TODO(gvisor.dev/issue/170): Support other chains.
 	// Since we only support modifying the INPUT chain right now, make sure
 	// all other chains point to ACCEPT rules.
@@ -548,7 +588,13 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		buf = optVal[:linux.SizeOfXTStandardTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		return translateToStandardTarget(standardTarget.Verdict)
+		if standardTarget.Verdict < 0 {
+			// A Verdict < 0 indicates a non-jump verdict.
+			return translateToStandardTarget(standardTarget.Verdict)
+		} else {
+			// A verdict >= 0 indicates a jump.
+			return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
+		}
 
 	case errorTargetName:
 		// Error target.
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
new file mode 100644
index 000000000..c421b87cf
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+)
+
+// JumpTarget implements iptables.Target.
+type JumpTarget struct {
+	// Offset is the byte offset of the rule to jump to. It is used for
+	// marshaling and unmarshaling.
+	Offset uint32
+
+	// RuleNum is the rule to jump to.
+	RuleNum int
+}
+
+// Action implements iptables.Target.Action.
+func (jt JumpTarget) Action(tcpip.PacketBuffer) (iptables.RuleVerdict, int) {
+	return iptables.RuleJump, jt.RuleNum
+}
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 75a433a3b..dbaccbb36 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -135,25 +135,53 @@ func EmptyFilterTable() Table {
 	}
 }
 
+// A chainVerdict is what a table decides should be done with a packet.
+type chainVerdict int
+
+const (
+	// chainAccept indicates the packet should continue through netstack.
+	chainAccept chainVerdict = iota
+
+	// chainDrop indicates the packet should be dropped.
+	chainDrop
+
+	// chainReturn indicates the packet should return to the calling chain
+	// or the underflow rule of a builtin chain.
+	chainReturn
+)
+
 // Check runs pkt through the rules for hook. It returns true when the packet
 // should continue traversing the network stack and false when it should be
 // dropped.
 //
 // Precondition: pkt.NetworkHeader is set.
 func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
-	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
-	// we're missing features. Jumps, the call stack, etc. aren't checked
-	// for yet because we're yet to support them.
-
 	// Go through each table containing the hook.
 	for _, tablename := range it.Priorities[hook] {
-		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
+		table := it.Tables[tablename]
+		ruleIdx := table.BuiltinChains[hook]
+		switch verdict := it.checkChain(hook, pkt, table, ruleIdx); verdict {
 		// If the table returns Accept, move on to the next table.
-		case TableAccept:
+		case chainAccept:
 			continue
 		// The Drop verdict is final.
-		case TableDrop:
+		case chainDrop:
 			return false
+		case chainReturn:
+			// Any Return from a built-in chain means we have to
+			// call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			switch v, _ := underflow.Target.Action(pkt); v {
+			case RuleAccept:
+				continue
+			case RuleDrop:
+				return false
+			case RuleJump, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
 		default:
 			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
 		}
@@ -164,37 +192,37 @@ func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
 }
 
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) TableVerdict {
+func (it *IPTables) checkChain(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
-	table := it.Tables[tablename]
-	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
-		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
+	for ruleIdx < len(table.Rules) {
+		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx); verdict {
 		case RuleAccept:
-			return TableAccept
+			return chainAccept
 
 		case RuleDrop:
-			return TableDrop
-
-		case RuleContinue:
-			continue
+			return chainDrop
 
 		case RuleReturn:
-			// TODO(gvisor.dev/issue/170): We don't implement jump
-			// yet, so any Return is from a built-in chain. That
-			// means we have to to call the underflow.
-			underflow := table.Rules[table.Underflows[hook]]
-			// Underflow is guaranteed to be an unconditional
-			// ACCEPT or DROP.
-			switch v, _ := underflow.Target.Action(pkt); v {
-			case RuleAccept:
-				return TableAccept
-			case RuleDrop:
-				return TableDrop
-			case RuleContinue, RuleReturn:
-				panic("Underflows should only return RuleAccept or RuleDrop.")
+			return chainReturn
+
+		case RuleJump:
+			// "Jumping" to the next rule just means we're
+			// continuing on down the list.
+			if jumpTo == ruleIdx+1 {
+				ruleIdx++
+				continue
+			}
+			switch verdict := it.checkChain(hook, pkt, table, jumpTo); verdict {
+			case chainAccept:
+				return chainAccept
+			case chainDrop:
+				return chainDrop
+			case chainReturn:
+				ruleIdx++
+				continue
 			default:
-				panic(fmt.Sprintf("Unknown verdict: %d", v))
+				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
 			}
 
 		default:
@@ -205,17 +233,18 @@ func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename stri
 
 	// We got through the entire table without a decision. Default to DROP
 	// for safety.
-	return TableDrop
+	return chainDrop
 }
 
 // Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
+func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// First check whether the packet matches the IP header filter.
 	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
 	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
-		return RuleContinue
+		// Continue on to the next rule.
+		return RuleJump, ruleIdx + 1
 	}
 
 	// Go through each rule matcher. If they all match, run
@@ -223,14 +252,14 @@ func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ru
 	for _, matcher := range rule.Matchers {
 		matches, hotdrop := matcher.Match(hook, pkt, "")
 		if hotdrop {
-			return RuleDrop
+			return RuleDrop, 0
 		}
 		if !matches {
-			return RuleContinue
+			// Continue on to the next rule.
+			return RuleJump, ruleIdx + 1
 		}
 	}
 
 	// All the matchers matched, so run the target.
-	verdict, _ := rule.Target.Action(pkt)
-	return verdict
+	return rule.Target.Action(pkt)
 }
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 9fc60cfad..81a2e39a2 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// This file contains various Targets.
-
 package iptables
 
 import (
@@ -25,16 +23,16 @@ import (
 type AcceptTarget struct{}
 
 // Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleAccept, ""
+func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleAccept, 0
 }
 
 // DropTarget drops packets.
 type DropTarget struct{}
 
 // Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleDrop, ""
+func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleDrop, 0
 }
 
 // ErrorTarget logs an error and drops the packet. It represents a target that
@@ -42,9 +40,9 @@ func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
 type ErrorTarget struct{}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
+func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, int) {
 	log.Debugf("ErrorTarget triggered.")
-	return RuleDrop, ""
+	return RuleDrop, 0
 }
 
 // UserChainTarget marks a rule as the beginning of a user chain.
@@ -53,7 +51,7 @@ type UserChainTarget struct {
 }
 
 // Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
+func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
 	panic("UserChainTarget should never be called.")
 }
 
@@ -62,6 +60,6 @@ func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
 type ReturnTarget struct{}
 
 // Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleReturn, ""
+func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, int) {
+	return RuleReturn, 0
 }
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 5735d001b..7d032fd23 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -56,17 +56,6 @@ const (
 	NumHooks
 )
 
-// A TableVerdict is what a table decides should be done with a packet.
-type TableVerdict int
-
-const (
-	// TableAccept indicates the packet should continue through netstack.
-	TableAccept TableVerdict = iota
-
-	// TableAccept indicates the packet should be dropped.
-	TableDrop
-)
-
 // A RuleVerdict is what a rule decides should be done with a packet.
 type RuleVerdict int
 
@@ -74,12 +63,12 @@ const (
 	// RuleAccept indicates the packet should continue through netstack.
 	RuleAccept RuleVerdict = iota
 
-	// RuleContinue indicates the packet should continue to the next rule.
-	RuleContinue
-
 	// RuleDrop indicates the packet should be dropped.
 	RuleDrop
 
+	// RuleJump indicates the packet should jump to another chain.
+	RuleJump
+
 	// RuleReturn indicates the packet should return to the previous chain.
 	RuleReturn
 )
@@ -174,6 +163,6 @@ type Matcher interface {
 type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
-	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
+	// Jump, it also returns the index of the rule to jump to.
+	Action(packet tcpip.PacketBuffer) (RuleVerdict, int)
 }
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index e26d6a7d2..706c09cea 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -26,6 +26,7 @@ const (
 	acceptPort       = 2402
 	sendloopDuration = 2 * time.Second
 	network          = "udp4"
+	chainName        = "foochain"
 )
 
 func init() {
@@ -40,6 +41,12 @@ func init() {
 	RegisterTestCase(FilterInputDefaultPolicyAccept{})
 	RegisterTestCase(FilterInputDefaultPolicyDrop{})
 	RegisterTestCase(FilterInputReturnUnderflow{})
+	RegisterTestCase(FilterInputSerializeJump{})
+	RegisterTestCase(FilterInputJumpBasic{})
+	RegisterTestCase(FilterInputJumpReturn{})
+	RegisterTestCase(FilterInputJumpReturnDrop{})
+	RegisterTestCase(FilterInputJumpBuiltin{})
+	RegisterTestCase(FilterInputJumpTwice{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -267,13 +274,12 @@ func (FilterInputMultiUDPRules) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
-		return err
-	}
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"},
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"},
+		{"-L"},
 	}
-	return filterTable("-L")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -314,14 +320,13 @@ func (FilterInputCreateUserChain) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputCreateUserChain) ContainerAction(ip net.IP) error {
-	// Create a chain.
-	const chainName = "foochain"
-	if err := filterTable("-N", chainName); err != nil {
-		return err
+	rules := [][]string{
+		// Create a chain.
+		{"-N", chainName},
+		// Add a simple rule to the chain.
+		{"-A", chainName, "-j", "DROP"},
 	}
-
-	// Add a simple rule to the chain.
-	return filterTable("-A", chainName, "-j", "DROP")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -396,13 +401,12 @@ func (FilterInputReturnUnderflow) Name() string {
 func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 	// Add a RETURN rule followed by an unconditional accept, and set the
 	// default policy to DROP.
-	if err := filterTable("-A", "INPUT", "-j", "RETURN"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-j", "RETURN"},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-P", "INPUT", "ACCEPT"},
 	}
-	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
-		return err
-	}
-	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+	if err := filterTableRules(rules); err != nil {
 		return err
 	}
 
@@ -415,3 +419,178 @@ func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// Verify that we can serialize jumps.
+type FilterInputSerializeJump struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputSerializeJump) Name() string {
+	return "FilterInputSerializeJump"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputSerializeJump) ContainerAction(ip net.IP) error {
+	// Write a JUMP rule, then serialize it with `-L`.
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-L"},
+	}
+	return filterTableRules(rules)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputSerializeJump) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// Jump to a chain and execute a rule there.
+type FilterInputJumpBasic struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBasic) Name() string {
+	return "FilterInputJumpBasic"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBasic) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBasic) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// Jump, return, and execute a rule.
+type FilterInputJumpReturn struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturn) Name() string {
+	return "FilterInputJumpReturn"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturn) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-P", "INPUT", "ACCEPT"},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "RETURN"},
+		{"-A", chainName, "-j", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturn) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+type FilterInputJumpReturnDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturnDrop) Name() string {
+	return "FilterInputJumpReturnDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturnDrop) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-A", chainName, "-j", "RETURN"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturnDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// Jumping to a top-level chain is illegal.
+type FilterInputJumpBuiltin struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBuiltin) Name() string {
+	return "FilterInputJumpBuiltin"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBuiltin) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-j", "OUTPUT"); err == nil {
+		return fmt.Errorf("iptables should be unable to jump to a built-in chain")
+	}
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBuiltin) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// Jump twice, then return twice and execute a rule.
+type FilterInputJumpTwice struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpTwice) Name() string {
+	return "FilterInputJumpTwice"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpTwice) ContainerAction(ip net.IP) error {
+	const chainName2 = chainName + "2"
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-N", chainName2},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", chainName2},
+		{"-A", "INPUT", "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// UDP packets should jump and return twice, eventually hitting the
+	// ACCEPT rule.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpTwice) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 46a7c99b0..0621861eb 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -249,3 +249,39 @@ func TestFilterOutputDropTCPSrcPort(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestJumpSerialize(t *testing.T) {
+	if err := singleTest(FilterInputSerializeJump{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpBasic(t *testing.T) {
+	if err := singleTest(FilterInputJumpBasic{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpReturn(t *testing.T) {
+	if err := singleTest(FilterInputJumpReturn{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpReturnDrop(t *testing.T) {
+	if err := singleTest(FilterInputJumpReturnDrop{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpBuiltin(t *testing.T) {
+	if err := singleTest(FilterInputJumpBuiltin{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestJumpTwice(t *testing.T) {
+	if err := singleTest(FilterInputJumpTwice{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 043114c78..293c4e6ed 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -35,6 +35,16 @@ func filterTable(args ...string) error {
 	return nil
 }
 
+// filterTableRules is like filterTable, but runs multiple iptables commands.
+func filterTableRules(argsList [][]string) error {
+	for _, args := range argsList {
+		if err := filterTable(args...); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 // listenUDP listens on a UDP port and returns the value of net.Conn.Read() for
 // the first read on that port.
 func listenUDP(port int, timeout time.Duration) error {
-- 
cgit v1.2.3


From 56fd9504aab44a738d3df164cbee8e572b309f28 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:44:22 -0800
Subject: Enable IPV6_RECVTCLASS socket option for datagram sockets

Added the ability to get/set the IPV6_RECVTCLASS socket option on UDP endpoints.
If enabled, the traffic class from the incoming network header is passed as
ancillary data in the ControlMessages.

Adding Get/SetSockOptBool to decrease the overhead of getting/setting simple
options. (This was absorbed in a CL that will be landing before this one).

Test:
* Added unit test to udp_test.go that tests getting/setting as well as
verifying that we receive expected TOS from incoming packet.
* Added a syscall test for verifying getting/setting
* Removed test skip for existing syscall test to enable end to end test.
PiperOrigin-RevId: 295840218
---
 pkg/sentry/socket/control/control.go         |   2 +-
 pkg/sentry/socket/netstack/netstack.go       |  27 +++++-
 pkg/tcpip/checker/checker.go                 |  14 +++
 pkg/tcpip/tcpip.go                           |  15 ++-
 pkg/tcpip/transport/udp/endpoint.go          |  38 +++++++-
 pkg/tcpip/transport/udp/udp_test.go          | 120 ++++++++++++++----------
 test/syscalls/linux/ip_socket_test_util.h    |  16 ++--
 test/syscalls/linux/socket_ip_udp_generic.cc | 133 +++++++++++++++++++--------
 test/syscalls/linux/udp_socket_test_cases.cc |   4 -
 9 files changed, 260 insertions(+), 109 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4667373d2..8834a1e1a 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -329,7 +329,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 }
 
 // PackTClass packs an IPV6_TCLASS socket control message.
-func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IPV6,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9757fbfba..e187276c5 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1318,6 +1318,22 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		}
 		return ib, nil
 
+	case linux.IPV6_RECVTCLASS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -1803,6 +1819,14 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
 
+	case linux.IPV6_RECVTCLASS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -2086,7 +2110,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
 		linux.IPV6_RECVPATHMTU,
 		linux.IPV6_RECVPKTINFO,
 		linux.IPV6_RECVRTHDR,
-		linux.IPV6_RECVTCLASS,
 		linux.IPV6_RTHDR,
 		linux.IPV6_RTHDRDSTOPTS,
 		linux.IPV6_TCLASS,
@@ -2424,6 +2447,8 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages {
 			Timestamp:       s.readCM.Timestamp,
 			HasTOS:          s.readCM.HasTOS,
 			TOS:             s.readCM.TOS,
+			HasTClass:       s.readCM.HasTClass,
+			TClass:          s.readCM.TClass,
 			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
 			PacketInfo:      s.readCM.PacketInfo,
 		},
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 4d6ae0871..c6c160dfc 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -161,6 +161,20 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTClass creates a checker that checks the TCLASS field in
+// ControlMessages.
+func ReceiveTClass(want uint32) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTClass {
+			t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want)
+		}
+		if got := cm.TClass; got != want {
+			t.Fatalf("got cm.TClass = %d, want %d", got, want)
+		}
+	}
+}
+
 // ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
 func ReceiveTOS(want uint8) ControlMessagesChecker {
 	return func(t *testing.T, cm tcpip.ControlMessages) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 9ca39ce40..ce5527391 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -323,11 +323,11 @@ type ControlMessages struct {
 	// TOS is the IPv4 type of service of the associated packet.
 	TOS uint8
 
-	// HasTClass indicates whether Tclass is valid/set.
+	// HasTClass indicates whether TClass is valid/set.
 	HasTClass bool
 
-	// Tclass is the IPv6 traffic class of the associated packet.
-	TClass int32
+	// TClass is the IPv6 traffic class of the associated packet.
+	TClass uint32
 
 	// HasIPPacketInfo indicates whether PacketInfo is set.
 	HasIPPacketInfo bool
@@ -502,9 +502,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTClassOption is used by SetSockOpt/GetSockOpt to specify if the
+	// IPV6_TCLASS ancillary message is passed with incoming packets.
+	ReceiveTClassOption SockOptBool = iota
+
 	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
 	// ancillary message is passed with incoming packets.
-	ReceiveTOSOption SockOptBool = iota
+	ReceiveTOSOption
 
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
@@ -514,6 +518,9 @@ const (
 	// if more inforamtion is provided with incoming packets such
 	// as interface index and address.
 	ReceiveIPPacketInfoOption
+
+	// TODO(b/146901447): convert existing bool socket options to be handled via
+	// Get/SetSockOptBool
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3fe91cac2..eff7f3600 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,7 +32,8 @@ type udpPacket struct {
 	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
+	// tos stores either the receiveTOS or receiveTClass value.
+	tos uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -119,6 +120,10 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveTClass determines if the incoming IPv6 TClass header field is
+	// passed as ancillary data to ControlMessages on Read.
+	receiveTClass bool
+
 	// receiveIPPacketInfo determines if the packet info is returned by Read.
 	receiveIPPacketInfo bool
 
@@ -258,13 +263,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveTClass := e.receiveTClass
 	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-
+	if receiveTClass {
+		cm.HasTClass = true
+		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
+		cm.TClass = uint32(p.tos)
+	}
 	if receiveIPPacketInfo {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
@@ -490,6 +500,17 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrNotSupported
+		}
+
+		e.mu.Lock()
+		e.receiveTClass = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -709,6 +730,17 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 		return v, nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrNotSupported
+		}
+
+		e.mu.RLock()
+		v := e.receiveTClass
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -1273,6 +1305,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		packet.packetInfo.LocalAddr = r.LocalAddress
 		packet.packetInfo.DestinationAddr = r.RemoteAddress
 		packet.packetInfo.NIC = r.NICID()
+	case header.IPv6ProtocolNumber:
+		packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f0ff3fe71..34b7c2360 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -409,6 +409,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
@@ -1336,7 +1337,7 @@ func TestSetTTL(t *testing.T) {
 	}
 }
 
-func TestTOSV4(t *testing.T) {
+func TestSetTOS(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1347,23 +1348,23 @@ func TestTOSV4(t *testing.T) {
 			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
 			if err := c.ep.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv4TOSOption(tos), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
 			if want := tcpip.IPv4TOSOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
 			testWrite(c, flow, checker.TOS(tos, 0))
@@ -1371,7 +1372,7 @@ func TestTOSV4(t *testing.T) {
 	}
 }
 
-func TestTOSV6(t *testing.T) {
+func TestSetTClass(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1379,71 +1380,92 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tClass = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt failed: %s", err)
+			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tClass)); err != nil {
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv6TrafficClassOption(tClass), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
-			if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+			if want := tcpip.IPv6TrafficClassOption(tClass); v != want {
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
-			testWrite(c, flow, checker.TOS(tos, 0))
+			// The header getter for TClass is called TOS, so use that checker.
+			testWrite(c, flow, checker.TOS(tClass, 0))
 		})
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
+func TestReceiveTosTClass(t *testing.T) {
+	testCases := []struct {
+		name             string
+		getReceiveOption tcpip.SockOptBool
+		tests            []testFlow
+	}{
+		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
+		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+	}
+	for _, testCase := range testCases {
+		for _, flow := range testCase.tests {
+			t.Run(fmt.Sprintf("%s:flow:%s", testCase.name, flow), func(t *testing.T) {
+				c := newDualTestContext(t, defaultMTU)
+				defer c.cleanup()
 
-			c.createEndpointForFlow(flow)
+				c.createEndpointForFlow(flow)
+				option := testCase.getReceiveOption
+				name := testCase.name
 
-			// Verify that setting and reading the option works.
-			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
-			}
+				// Verify that setting and reading the option works.
+				v, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
+				// Test for expected default value.
+				if v != false {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
+				}
 
-			want := true
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
-				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
-			}
+				want := true
+				if err := c.ep.SetSockOptBool(option, want); err != nil {
+					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
+				}
 
-			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			if got != want {
-				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
-			}
+				got, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
 
-			// Verify that the correct received TOS is handed through as
-			// ancillary data to the ControlMessages struct.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatal("Bind failed:", err)
-			}
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
+				if got != want {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
+				}
+
+				// Verify that the correct received TOS or TClass is handed through as
+				// ancillary data to the ControlMessages struct.
+				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+					c.t.Fatalf("Bind failed: %s", err)
+				}
+				switch option {
+				case tcpip.ReceiveTClassOption:
+					testRead(c, flow, checker.ReceiveTClass(testTOS))
+				case tcpip.ReceiveTOSOption:
+					testRead(c, flow, checker.ReceiveTOS(testTOS))
+				default:
+					t.Fatalf("unknown test variant: %s", name)
+				}
+			})
+		}
 	}
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 083ebbcf0..39fd6709d 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -84,20 +84,20 @@ SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type);
 // SocketPairs created with AF_INET and the given type.
 SocketPairKind IPv4UDPUnboundSocketPair(int type);
 
-// IPv4UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_DGRAM, and the given type.
+// IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_DGRAM, and the given type.
 SocketKind IPv4UDPUnboundSocket(int type);
 
-// IPv6UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_DGRAM, and the given type.
+// IPv6UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_DGRAM, and the given type.
 SocketKind IPv6UDPUnboundSocket(int type);
 
-// IPv4TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_STREAM and the given type.
+// IPv4TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_STREAM and the given type.
 SocketKind IPv4TCPUnboundSocket(int type);
 
-// IPv6TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_STREAM and the given type.
+// IPv6TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_STREAM and the given type.
 SocketKind IPv6TCPUnboundSocket(int type);
 
 // IfAddrHelper is a helper class that determines the local interfaces present
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index db5663ecd..1c533fdf2 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -14,6 +14,7 @@
 
 #include "test/syscalls/linux/socket_ip_udp_generic.h"
 
+#include <errno.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -209,46 +210,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -401,5 +362,97 @@ TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
   EXPECT_EQ(get_len, sizeof(get));
 }
 
+// Holds TOS or TClass information for IPv4 or IPv6 respectively.
+struct RecvTosOption {
+  int level;
+  int option;
+};
+
+RecvTosOption GetRecvTosOption(int domain) {
+  TEST_CHECK(domain == AF_INET || domain == AF_INET6);
+  RecvTosOption opt;
+  switch (domain) {
+    case AF_INET:
+      opt.level = IPPROTO_IP;
+      opt.option = IP_RECVTOS;
+      break;
+    case AF_INET6:
+      opt.level = IPPROTO_IPV6;
+      opt.option = IPV6_RECVTCLASS;
+      break;
+  }
+  return opt;
+}
+
+// Ensure that Receiving TOS or TCLASS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS or IPV6_RECVTCLASS works as
+// expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test that any socket (including IPv6 only) accepts the IPv4 TOS option: this
+// mirrors behavior in Linux.
+TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(AF_INET);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+}
+
+// Test that an IPv4 socket does not support the IPv6 TClass option.
+TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
+  // This should only test AF_INET sockets for the mismatch behavior.
+  SKIP_IF(GetParam().domain != AF_INET);
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IPV6, IPV6_RECVTCLASS,
+                         &get, &get_len),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 9f8de6b48..57b1a357c 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,9 +1349,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,7 +1419,6 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 2daa21e4d73f2297a8bca32c76100333e9ac4af4 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 16:47:58 -0800
Subject: Internal change.

PiperOrigin-RevId: 296088213
---
 pkg/sentry/socket/netstack/provider.go | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index 5afff2564..5f181f017 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -75,6 +75,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		switch protocol {
 		case syscall.IPPROTO_ICMP:
 			return header.ICMPv4ProtocolNumber, true, nil
+		case syscall.IPPROTO_ICMPV6:
+			return header.ICMPv6ProtocolNumber, true, nil
 		case syscall.IPPROTO_UDP:
 			return header.UDPProtocolNumber, true, nil
 		case syscall.IPPROTO_TCP:
-- 
cgit v1.2.3


From abf7ebcd38e8c2750f4542f29115140bb2b44a9b Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Thu, 27 Feb 2020 10:59:32 -0800
Subject: Internal change.

PiperOrigin-RevId: 297638665
---
 pkg/sentry/socket/netstack/netstack.go |  40 +++++++++--
 pkg/tcpip/transport/packet/endpoint.go |  21 +++++-
 test/syscalls/linux/packet_socket.cc   | 124 ++++++++++++++++++++++++++++++---
 3 files changed, 167 insertions(+), 18 deletions(-)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e187276c5..48c268bfa 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,14 +712,40 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
-	if err != nil {
-		return err
-	}
-	if err := s.checkFamily(family, true /* exact */); err != nil {
-		return err
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// AddressAndFamily also checks the address length, which is not needed
+	// for an AF_PACKET bind, so that family is handled separately here.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
 	}
-	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 5722815e9..09a1cd436 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -76,6 +76,7 @@ type endpoint struct {
 	sndBufSize int
 	closed     bool
 	stats      tcpip.TransportEndpointStats `state:"nosave"`
+	bound      bool
 }
 
 // NewEndpoint returns a new packet endpoint.
@@ -125,6 +126,7 @@ func (ep *endpoint) Close() {
 	}
 
 	ep.closed = true
+	ep.bound = false
 	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 }
 
@@ -216,7 +218,24 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
 	// - packet(7).
 
-	return tcpip.ErrNotSupported
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.bound {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// Unregister endpoint with all the nics.
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	// Bind endpoint to receive packets from specific interface.
+	if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil {
+		return err
+	}
+
+	ep.bound = true
+
+	return nil
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 92ae55eec..bc22de788 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <ifaddrs.h>
 #include <linux/capability.h>
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
@@ -163,16 +164,11 @@ int CookedPacketTest::GetLoopbackIndex() {
   return ifr.ifr_ifindex;
 }
 
-// Receive via a packet socket.
-TEST_P(CookedPacketTest, Receive) {
-  // Let's use a simple IP payload: a UDP datagram.
-  FileDescriptor udp_sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-  SendUDPMessage(udp_sock.get());
-
+// Receive and verify the message via packet socket on interface.
+void ReceiveMessage(int sock, int ifindex) {
   // Wait for the socket to become readable.
   struct pollfd pfd = {};
-  pfd.fd = socket_;
+  pfd.fd = sock;
   pfd.events = POLLIN;
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
 
@@ -182,9 +178,10 @@ TEST_P(CookedPacketTest, Receive) {
   char buf[64];
   struct sockaddr_ll src = {};
   socklen_t src_len = sizeof(src);
-  ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
+  ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
+
   // sockaddr_ll ends with an 8 byte physical address field, but ethernet
   // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
   // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
@@ -194,7 +191,7 @@ TEST_P(CookedPacketTest, Receive) {
   // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
+  EXPECT_EQ(src.sll_ifindex, ifindex);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -222,6 +219,18 @@ TEST_P(CookedPacketTest, Receive) {
   EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
 }
 
+// Receive via a packet socket.
+TEST_P(CookedPacketTest, Receive) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  int loopback_index = GetLoopbackIndex();
+  ReceiveMessage(socket_, loopback_index);
+}
+
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
   // TODO(b/129292371): Remove once we support packet socket writing.
@@ -313,6 +322,101 @@ TEST_P(CookedPacketTest, Send) {
   EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
 }
 
+// Bind and receive via packet socket.
+TEST_P(CookedPacketTest, BindReceive) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  ReceiveMessage(socket_, bind_addr.sll_ifindex);
+}
+
+// Double Bind socket.
+TEST_P(CookedPacketTest, DoubleBind) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Binding socket again should fail.
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+           sizeof(bind_addr)),
+      // Linux 4.9 returns EINVAL here, but some time before 4.19 it switched
+      // to EADDRINUSE.
+      AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+// Bind and verify we do not receive data on an interface that is not bound.
+TEST_P(CookedPacketTest, BindDrop) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct ifaddrs* if_addr_list = nullptr;
+  auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+  ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+  // Get interface other than loopback.
+  struct ifreq ifr = {};
+  for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+    if (strcmp(i->ifa_name, "lo") != 0) {
+      strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name));
+      break;
+    }
+  }
+
+  // Skip if no interface is available other than loopback.
+  if (strlen(ifr.ifr_name) == 0) {
+    GTEST_SKIP();
+  }
+
+  // Get interface index.
+  EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  EXPECT_NE(ifr.ifr_ifindex, 0);
+
+  // Bind to packet socket requires only family, protocol and ifindex.
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = ifr.ifr_ifindex;
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Send to loopback interface.
+  struct sockaddr_in dest = {};
+  dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  dest.sin_family = AF_INET;
+  dest.sin_port = kPort;
+  EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0,
+                     reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+              SyscallSucceedsWithValue(sizeof(kMessage)));
+
+  // Wait and make sure the socket never receives any data.
+  struct pollfd pfd = {};
+  pfd.fd = socket_;
+  pfd.events = POLLIN;
+  EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From 42fb7d349137bd8847e7c3df6493fde3bc8e6e89 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 2 Mar 2020 10:32:20 -0800
Subject: socket: take readMu to access readView

DATA RACE in netstack.(*SocketOperations).fetchReadView

Write at 0x00c001dca138 by goroutine 1001:
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).fetchReadView()
      pkg/sentry/socket/netstack/netstack.go:418 +0x85
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).coalescingRead()
      pkg/sentry/socket/netstack/netstack.go:2309 +0x67
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).nonBlockingRead()
      pkg/sentry/socket/netstack/netstack.go:2378 +0x183d

Previous read at 0x00c001dca138 by goroutine 1111:
  gvisor.dev/gvisor/pkg/sentry/socket/netstack.(*SocketOperations).Ioctl()
      pkg/sentry/socket/netstack/netstack.go:2666 +0x533
  gvisor.dev/gvisor/pkg/sentry/syscalls/linux.Ioctl()

Reported-by: syzbot+d4c3885fcc346f08deb6@syzkaller.appspotmail.com
PiperOrigin-RevId: 298387377
---
 pkg/sentry/socket/netstack/netstack.go | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 48c268bfa..1eeb37446 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2663,7 +2663,9 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 		}
 
 		// Add bytes removed from the endpoint but not yet sent to the caller.
+		s.readMu.Lock()
 		v += len(s.readView)
+		s.readMu.Unlock()
 
 		if v > math.MaxInt32 {
 			v = math.MaxInt32
-- 
cgit v1.2.3


From 43abb24657e737dee1108ff0d512b2e1b6d8a3f6 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Mon, 2 Mar 2020 16:30:51 -0800
Subject: Fix panic caused by invalid address for Bind in packet sockets.

PiperOrigin-RevId: 298476533
---
 pkg/sentry/socket/netstack/netstack.go |  4 ++++
 test/syscalls/linux/packet_socket.cc   | 13 +++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'pkg/sentry/socket')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 1eeb37446..13a9a60b4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,6 +712,10 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
+	}
+
 	family := usermem.ByteOrder.Uint16(sockaddr)
 	var addr tcpip.FullAddress
 
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index bc22de788..248762ca9 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -417,6 +417,19 @@ TEST_P(CookedPacketTest, BindDrop) {
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
 }
 
+// Bind with invalid address.
+TEST_P(CookedPacketTest, BindFail) {
+  // Null address.
+  ASSERT_THAT(bind(socket_, nullptr, sizeof(struct sockaddr)),
+              SyscallFailsWithErrno(EFAULT));
+
+  // Address of size 1.
+  uint8_t addr = 0;
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3